# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import argparse
from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

has_fmulz = '(options->has_fmulz || \
             (options->has_fmulz_no_denorms && \
              !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value. A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact. Such operations will only get
# applied to SSA values that do not have the exact bit set. This should be
# used by any optimizations that are not bit-for-bit exact. It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified. For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size. In the search half of the expression this indicates that it
# should only match that particular bit-size. In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle. If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
#
# Another set of special "conditions" is
# "nsz": sign of zero is not preserved
# "ninf": infinities are not preserved
# "nnan": nan is not preserved
# These relate to the float controls/fpfastmath and are more descriptions of
# the expression than conditions. That is, an expression with the "nsz"
# condition means that the replacement expression won't preserve the sign of
# zero of the result, and so it will be skipped if the matching instruction
# has the 'signed_zero_preserve' flag set.

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
   x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
   x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
   return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
   return struct.unpack('!f', struct.pack('!I', i))[0]

# Takes a pattern as input and returns a list of patterns where each
# pattern has a different permutation of fneg/fabs(value) as the replacement
# for the key operands in replacements.
def add_fabs_fneg(pattern, replacements, commutative = True):
   def to_list(pattern):
      return [to_list(i) if isinstance(i, tuple) else i for i in pattern]

   def to_tuple(pattern):
      return tuple(to_tuple(i) if isinstance(i, list) else i for i in pattern)

   def replace_variable(pattern, search, replace):
      for i in range(len(pattern)):
         if pattern[i] == search:
            pattern[i] = replace
         elif isinstance(pattern[i], list):
            replace_variable(pattern[i], search, replace)

   if commutative:
      perms = itertools.combinations_with_replacement(range(4), len(replacements))
   else:
      perms = itertools.product(range(4), repeat=len(replacements))

   result = []

   for perm in perms:
      curr = to_list(pattern)

      for i, (search, base) in enumerate(replacements.items()):
         if perm[i] == 0:
            replace = ['fneg', ['fabs', base]]
         elif perm[i] == 1:
            replace = ['fabs', base]
         elif perm[i] == 2:
            replace = ['fneg', base]
         elif perm[i] == 3:
            replace = base

         replace_variable(curr, search, replace)

      result.append(to_tuple(curr))
   return result


optimizations = [
   # These will be recreated by late_algebraic if supported.
   # Lowering here means we don't have to duplicate all other optimization patterns.
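   # As an informal illustration of the tuple format documented above (this is
   # a comment only, not an additional rule): the first entry below,
   #    (('fgeu', a, b), ('inot', ('flt', a, b))),
   # reads "replace an unordered greater-than-or-equal with the logical-not of
   # an ordered less-than".  The two sides agree for every input, including
   # NaN: fgeu(NaN, 1.0) is true because the operands are unordered, while
   # flt(NaN, 1.0) is false, so inot(flt(NaN, 1.0)) is true as well.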
152 (('fgeu', a, b), ('inot', ('flt', a, b))), 153 (('fltu', a, b), ('inot', ('fge', a, b))), 154 (('fneo', 0.0, a), ('flt', 0.0, ('fabs', a))), 155 (('fequ', 0.0, a), ('inot', ('flt', 0.0, ('fabs', a)))), 156 157 158 (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 159 (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'), 160 (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'), 161 (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'), 162 (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'), 163 (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 164 (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'), 165 166 (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))), 167 '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'), 168 169 (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 170 (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 171 (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), 172 (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'), 173 (('udiv', a, 1), a), 174 (('idiv', a, 1), a), 175 (('umod', a, 1), 0), 176 (('imod', a, 1), 0), 177 (('imod', a, -1), 0), 178 (('irem', a, 1), 0), 179 (('irem', a, -1), 0), 180 (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'), 181 (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'), 182 (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'), 183 (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'), 184 (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'), 185 (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'), 186 # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)' 187 (('irem', a, '#b(is_pos_power_of_two)'), 188 ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))), 189 '!options->lower_bitops'), 190 (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'), 191 192 (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'), 193 194 (('~fneg', ('fneg', a)), a), 195 (('ineg', ('ineg', a)), a), 196 (('fabs', ('fneg', a)), ('fabs', a)), 197 (('fabs', ('u2f', a)), ('u2f', a)), 198 (('iabs', ('iabs', a)), ('iabs', a)), 199 (('iabs', ('ineg', a)), ('iabs', a)), 200 (('~fadd', a, 0.0), a), 201 # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a 202 # floating point instruction, they should flush any input denormals and we 203 # can replace -0.0 with 0.0 if the float execution mode allows it. 
204 (('fadd(is_only_used_as_float,nsz)', 'a', 0.0), a), 205 (('fadd(is_only_used_as_float)', a, '#b(is_negative_zero)'), a), 206 (('fadd', ('fneg', a), '#b(is_negative_zero)'), ('fneg', a)), 207 (('iadd', a, 0), a), 208 (('iadd_sat', a, 0), a), 209 (('isub_sat', a, 0), a), 210 (('uadd_sat', a, 0), a), 211 (('usub_sat', a, 0), a), 212 (('usadd_4x8_vc4', a, 0), a), 213 (('usadd_4x8_vc4', a, ~0), ~0), 214 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 215 (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))), 216 (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)), 217 (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))), 218 (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)), 219 (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))), 220 (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)), 221 (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))), 222 (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)), 223 (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))), 224 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 225 (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)), 226 (('iand', ('iand', a, b), ('iand(is_used_once)', a, c)), ('iand', ('iand', a, b), c)), 227 (('ior', ('ior', a, b), ('ior(is_used_once)', a, c)), ('ior', ('ior', a, b), c)), 228 (('iand', ('ior(is_used_once)', a, b), ('ior(is_used_once)', a, c)), ('ior', a, ('iand', b, c))), 229 (('ior', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), ('iand', a, ('ior', b, c))), 230 # (a & b) | (a | c) => ((a & b) | a) | c => a | c 231 (('ior', ('iand', a, b), ('ior', a, c)), ('ior', a, c)), 232 # (a & b) & (a | c) => b & (a & (a | c)) => b & a 233 (('iand', ('iand', a, b), ('ior', a, c)), ('iand', a, b)), 234 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)), 235 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)), 236 (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))), 237 (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))), 238 (('~fadd', ('fneg', a), a), 0.0), 239 (('iadd', ('ineg', a), a), 0), 240 (('iadd', ('ineg', a), ('iadd', a, b)), b), 241 (('iadd', a, ('iadd', ('ineg', a), b)), b), 242 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 243 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 244 (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))), 245 (('fadd', a, a), ('fmul', a, 2.0)), 246 (('~fmul', a, 0.0), 0.0), 247 # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN 248 (('fmul(nsz,nnan)', 'a', 0.0), 0.0), 249 (('fmulz', a, 0.0), 0.0), 250 (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)), 251 (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)), 252 (('fmulz', a, a), ('fmul', a, a)), 253 (('ffmaz(nsz)', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c)), 254 (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)), 255 (('ffmaz', a, a, b), ('ffma', a, a, b)), 256 (('imul', a, 0), 0), 257 (('umul_unorm_4x8_vc4', a, 0), 0), 258 (('umul_unorm_4x8_vc4', a, ~0), a), 259 (('~fmul', a, 1.0), a), 260 (('~fmulz', a, 1.0), a), 261 # The only 
effect a*1.0 can have is flushing denormals. If it's only used by 262 # a floating point instruction, they should flush any input denormals and 263 # this multiplication isn't needed. 264 (('fmul(is_only_used_as_float)', a, 1.0), a), 265 (('imul', a, 1), a), 266 (('fmul', a, -1.0), ('fneg', a)), 267 (('imul', a, -1), ('ineg', a)), 268 # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a 269 # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a 270 # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0 271 # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN 272 (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)), 273 (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)), 274 (('~ffma', 0.0, a, b), b), 275 (('ffma(is_only_used_as_float,nsz,nnan,ninf)', 0.0, a, b), b), 276 (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)), 277 (('~ffma', a, b, 0.0), ('fmul', a, b)), 278 (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)), 279 (('ffmaz(nsz)', a, b, 0.0), ('fmulz', a, b)), 280 (('ffma', 1.0, a, b), ('fadd', a, b)), 281 (('ffmaz(nsz)', 1.0, a, b), ('fadd', a, b)), 282 (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)), 283 (('ffmaz(nsz)', -1.0, a, b), ('fadd', ('fneg', a), b)), 284 (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)), 285 (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)), 286 (('~flrp', a, b, 0.0), a), 287 (('~flrp', a, b, 1.0), b), 288 (('~flrp', a, a, b), a), 289 (('~flrp', 0.0, a, b), ('fmul', a, b)), 290 291 # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c) 292 (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)), 293 294 (('sdot_4x8_iadd', a, 0, b), b), 295 (('udot_4x8_uadd', a, 0, b), b), 296 (('sdot_4x8_iadd_sat', a, 0, b), b), 297 (('udot_4x8_uadd_sat', a, 0, b), b), 298 (('sdot_2x16_iadd', a, 0, b), b), 299 (('udot_2x16_uadd', a, 0, b), b), 300 (('sdot_2x16_iadd_sat', a, 0, b), b), 301 (('udot_2x16_uadd_sat', a, 0, b), b), 302 303 # sudot_4x8_iadd is not commutative at all, so the patterns must be 304 # duplicated with zeros on each of the first positions. 305 (('sudot_4x8_iadd', a, 0, b), b), 306 (('sudot_4x8_iadd', 0, a, b), b), 307 (('sudot_4x8_iadd_sat', a, 0, b), b), 308 (('sudot_4x8_iadd_sat', 0, a, b), b), 309 310 (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))), 311 (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))), 312 (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))), 313 (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))), 314 (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))), 315 316 # Try to let constant folding eliminate the dot-product part. These are 317 # safe because the dot product cannot overflow 32 bits. 
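   # As a rough bound on "cannot overflow": the largest magnitude any of these
   # dot products can reach is the unsigned case, 4 * 255 * 255 = 260100,
   # which is far below the 32-bit limit, so reassociating the additive term
   # below is safe.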
318 (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)), 319 (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)), 320 (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)), 321 (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)), 322 (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)), 323 (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)), 324 (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)), 325 (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)), 326 (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)), 327 (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)), 328 (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)), 329 (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 330 (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'), 331 (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 332 (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 333 (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'), 334 335 # Optimize open-coded fmulz. 336 # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b) 337 *add_fabs_fneg((('fmul@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb')), 338 ('fmulz', 'ma', 'mb'), has_fmulz), {'ma' : a, 'mb' : b}), 339 *add_fabs_fneg((('fmul@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')), 340 ('fmulz', 'ma', b), has_fmulz), {'ma' : a}), 341 342 # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c) 343 *add_fabs_fneg((('ffma@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb'), c), 344 ('ffmaz', 'ma', 'mb', c), has_fmulz), {'ma' : a, 'mb' : b}), 345 *add_fabs_fneg((('ffma@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c), 346 ('ffmaz', 'ma', b, c), has_fmulz), {'ma' : a}), 347 348 # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b)) 349 *add_fabs_fneg((('bcsel(nsz,nnan,ninf)', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, 'mb'))), 350 ('fexp2', ('fmulz', a, 'mb')), 351 has_fmulz), {'mb': b}), 352 *add_fabs_fneg((('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmulz', a, 'mb'))), 353 ('fexp2', ('fmulz', a, 'mb'))), {'mb': b}), 354] 355 356# Bitwise operations affecting the sign may be replaced by equivalent 357# floating point operations, except possibly for denormal 358# behaviour hence the is_only_used_as_float. 
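
# A quick way to convince yourself of the bit-level identities that the loop
# below relies on, sketched for the 32-bit case only.  This helper is purely
# illustrative: it is not called anywhere, and its name is made up here.
def _check_sign_bit_identities_f32(x):
   # Round x through float32 so the reference values below correspond exactly
   # to the bit pattern (NaN inputs are not handled by this check).
   bits = struct.unpack('!I', struct.pack('!f', x))[0]
   x = intBitsToFloat(bits)
   assert intBitsToFloat(bits & 0x7fffffff) == abs(x)   # iand -> fabs(a)
   assert intBitsToFloat(bits ^ 0x80000000) == -x       # ixor -> fneg(a)
   assert intBitsToFloat(bits | 0x80000000) == -abs(x)  # ior  -> fneg(fabs(a))
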
359for sz in (16, 32, 64): 360 sign_bit = 1 << (sz - 1) 361 362 optimizations.extend([ 363 (('iand(is_only_used_as_float)', f'a@{sz}', sign_bit - 1), ('fabs', a)), 364 (('ixor(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', a)), 365 (('ior(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', ('fabs', a))), 366 ]) 367 368# Shorthand for the expansion of just the dot product part of the [iu]dp4a 369# instructions. 370sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)), 371 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))), 372 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)), 373 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3)))) 374udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)), 375 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))), 376 ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)), 377 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3)))) 378sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)), 379 ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))), 380 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)), 381 ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3)))) 382sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)), 383 ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1))) 384udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)), 385 ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1))) 386 387optimizations.extend([ 388 (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'), 389 (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'), 390 (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'), 391 (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'), 392 (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'), 393 394 # For the unsigned dot-product, the largest possible value 4*(255*255) = 395 # 0x3f804, so we don't have to worry about that intermediate result 396 # overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant 397 # that is less than 0xfffc07fc, then the result cannot overflow ever. 398 (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)), 399 (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'), 400 401 # For the signed dot-product, the largest positive value is 4*(-128*-128) = 402 # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We 403 # don't have to worry about that intermediate result overflowing or 404 # underflowing. 
405 (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'), 406 407 (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'), 408 409 (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'), 410 (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'), 411]) 412 413# Float sizes 414for s in [16, 32, 64]: 415 optimizations.extend([ 416 (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 417 418 (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)), 419 (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)), 420 (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)), 421 422 (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)), 423 # These are the same as the previous three rules, but it depends on 424 # 1-fsat(x) <=> fsat(1-x). See below. 425 (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)), 426 (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)), 427 428 (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 429 (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 430 431 (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)), 432 (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)), 433 434 # These two aren't flrp lowerings, but do appear in some shaders. 
435 (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)), 436 (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))), 437 438 # 1 - ((1 - a) * (1 - b)) 439 # 1 - (1 - a - b + a*b) 440 # 1 - 1 + a + b - a*b 441 # a + b - a*b 442 # a + b*(1 - a) 443 # b*(1 - a) + 1*a 444 # flrp(b, 1, a) 445 (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)), 446 ]) 447 448optimizations.extend([ 449 (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)), 450 451 (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)), 452 453 # D3D9 vertex shader trunc 454 (('fadd', ('ffloor', a), ('b2f', ('iand', ('flt', a, 0), ('flt', ('fneg', ('ffract', a)), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))), 455 # D3D9 pixel shader trunc 456 (('fadd', ('ffloor', a), ('b2f', ('inot', ('fge', 0, ('fmin', ('fneg', a), ('ffract', a)))))), ('ftrunc', ('fadd', a, 0))), 457 (('fadd', ('ffloor', a), ('b2f', ('flt', 0, ('fmin', ('fneg', a), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))), 458 459 (('fadd(nnan,nsz)', a, ('ffract', ('fneg', a))), ('fceil', a), '!options->lower_fceil'), 460 461 (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'), 462 (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'), 463 (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 464 '(options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)) && (!(options->lower_doubles_options & nir_lower_dfloor) || !(options->lower_doubles_options & nir_lower_dfract))'), 465 466 (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), 467 (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), 468 (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'), 469 (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'), 470 (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'), 471 (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'), 472 (('fadd@16(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'), 473 (('fadd@32(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'), 474 (('fadd@64(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'), 475 (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), 476 (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), 477 (('ffract@64', a), ('fsub', a, ('ffloor', a)), 478 '(options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)) && !(options->lower_doubles_options & nir_lower_dfloor)'), 479 (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'), 480 (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'), 481 (('ffma@32', a, b, c), 
    ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', a, 0.0), 0.0),
   (('fdot3', a, 0.0), 0.0),
   (('fdot2', a, 0.0), 0.0),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here. Shifts in NIR use only the lower log2(bitsize)
# bits of the second source. These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
      ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
      ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

      # To get -1 for large shifts of negative values, ishr must instead
      # clamp the shift count to the maximum value.
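      # For example, with 32-bit values and b = c = 20, the combined count 40
      # is out of range, yet for a negative 'a' the desired result of
      # ((a >> 20) >> 20) is still -1 (all sign bits), not 0.  Clamping the
      # summed count to 31 produces exactly that, which is why ishr cannot
      # reuse the bcsel form above.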
      ((ishr, (ishr, a, '#b'), '#c'),
       (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
   a_sz = 'a@{}'.format(size)

   optimizations.extend([
      # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
      (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
      (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

      # This does not trivially work with ishr.
      (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
   ])

# Collapses ubfe(ubfe(a, b, c), d, e) when b, c, d, e are constants.
def ubfe_ubfe(a, b, c, d, e):
   inner_offset = ('iand', b, 0x1f)
   inner_bits = ('umin', ('iand', c, 0x1f), ('isub', 32, inner_offset))
   outer_offset = ('iand', d, 0x1f)
   outer_bits = ('iand', e, 0x1f)

   offset = ('iadd', inner_offset, outer_offset)
   bits = ('umin', outer_bits, ('imax', ('isub', inner_bits, outer_offset), 0))
   collapsed = ('ubfe', a, offset, bits)
   offset_out_of_range = ('ilt', 31, offset)

   # This will be constant-folded to either 0 or the collapsed ubfe,
   # whose offset and bits operands will also be constant folded.
   return ('bcsel', offset_out_of_range, 0, collapsed)

optimizations.extend([
   # Create bitfield extract from right-shift + and pattern.
   (('iand@32', ('ushr@32(is_used_once)', a, b), '#c(is_const_bitmask)'),
    ('ubfe', a, b, ('bit_count', c)),
    'options->has_bfe && !options->avoid_ternary_with_two_constants'),

   (('iand@32', ('ushr@32', a, b), ('bfm', c, 0)),
    ('ubfe', a, b, c), 'options->has_bfe'),

   (('ushr', ('iand', a, ('bfm', c, b)), b),
    ('ubfe', a, b, c), 'options->has_bfe'),

   # Collapse two bitfield extracts with constant operands into a single one.
   (('ubfe', ('ubfe', a, '#b', '#c'), '#d', '#e'),
    ubfe_ubfe(a, b, c, d, e)),

   # Collapse non-zero right-shift into bitfield extract.
   (('ushr@32', ('ubfe', a, '#b', '#c'), '#d(is_5lsb_not_zero)'),
    ubfe_ubfe(a, b, c, d, 31)),

   (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
   (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
      # Reassociate for improved CSE
      (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32). In addition, a couple rules inside the
# loop are commented out. Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
   lo_mask = 0xffffffff >> i
   hi_mask = (0xffffffff << i) & 0xffffffff

   optimizations.extend([
      # This pattern seems to only help in the soft-fp64 code.
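      # Worked example for i = 16: in (a & 0x0000ffff) << 16, every bit that
      # the mask clears would have been shifted out of the 32-bit result
      # anyway, so the masking is redundant and the whole thing is just
      # a << 16.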
630 (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)), 631# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)), 632# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)), 633 634 (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)), 635 (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)), 636# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct 637 ]) 638 639optimizations.extend([ 640 # This is common for address calculations. Reassociating may enable the 641 # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD 642 # instruction or a constant offset field for in load / store instructions. 643 (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))), 644 645 # (a + #b) * #c => (a * #c) + (#b * #c) 646 (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))), 647 648 # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d) 649 (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 650 ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))), 651 (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 652 ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))), 653 654 # Comparison simplifications 655 (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)), 656 (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)), 657 (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 658 (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 659 (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)), 660 (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)), 661 (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)), 662 (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)), 663 (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)), 664 (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)), 665 666 (('iand', ('feq', a, b), ('fneu', a, b)), False), 667 (('iand', ('flt', a, b), ('flt', b, a)), False), 668 (('iand', ('ieq', a, b), ('ine', a, b)), False), 669 (('iand', ('ilt', a, b), ('ilt', b, a)), False), 670 (('iand', ('ult', a, b), ('ult', b, a)), False), 671 672 # This helps some shaders because, after some optimizations, they end up 673 # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of 674 # matching would be handled by CSE. 
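   # Negating both operands of an ordered comparison simply swaps them:
   # -a < -b  <=>  b < a  (and likewise for >=, ==, !=).  NaN behaviour is
   # unchanged because fneg(NaN) is still NaN, so these rewrites are exact.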
675 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 676 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 677 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 678 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 679 (('flt', ('fneg', 'a(is_not_const)'), '#b'), ('flt', ('fneg', b), a)), 680 (('flt', '#b', ('fneg', 'a(is_not_const)')), ('flt', a, ('fneg', b))), 681 (('fge', ('fneg', 'a(is_not_const)'), '#b'), ('fge', ('fneg', b), a)), 682 (('fge', '#b', ('fneg', 'a(is_not_const)')), ('fge', a, ('fneg', b))), 683 (('fneu', ('fneg', 'a(is_not_const)'), '#b'), ('fneu', ('fneg', b), a)), 684 (('feq', '#b', ('fneg', 'a(is_not_const)')), ('feq', a, ('fneg', b))), 685 (('flt', a, '#b(is_negative_zero)'), ('flt', a, 0.0)), 686 (('flt', '#b(is_negative_zero)', a), ('flt', 0.0, a)), 687 (('fge', a, '#b(is_negative_zero)'), ('fge', a, 0.0)), 688 (('fge', '#b(is_negative_zero)', a), ('fge', 0.0, a)), 689 (('fneu', a, '#b(is_negative_zero)'), ('fneu', 0.0, a)), 690 (('feq', '#b(is_negative_zero)', a), ('feq', a, 0.0)), 691 692 (('ieq', ('ineg', a), 0), ('ieq', a, 0)), 693 (('ine', ('ineg', a), 0), ('ine', a, 0)), 694 (('ieq', ('iabs', a), 0), ('ieq', a, 0)), 695 (('ine', ('iabs', a), 0), ('ine', a, 0)), 696 (('fneu', ('fabs', a), 0.0), ('fneu', a, 0.0)), 697 (('feq', ('fabs', a), 0.0), ('feq', a, 0.0)), 698 (('fneu', ('fabs', a), ('fabs', a)), ('fneu', a, a)), 699 (('feq', ('fabs', a), ('fabs', a)), ('feq', a, a)), 700 701 # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false. 702 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 703 704 # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false. 705 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 706 707 # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false. 708 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 709 710 # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true. 711 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 712 713 # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false. 714 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 715 716 # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false. 
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b). However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f. This just
   # breaks that apart.
765 (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0), 766 ('ior', ('flt', c, 0.0), ('ior', a, b))), 767 768 (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)), 769 (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)), 770 (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)), 771 (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)), 772 (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))), 773 (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)), 774 (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))), 775 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)), 776 (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))), 777 (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)), 778 (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))), 779 (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)), 780 781 # Cannot remove the addition from ilt or ige due to overflow. 782 (('ieq', ('iadd', a, b), a), ('ieq', b, 0)), 783 (('ine', ('iadd', a, b), a), ('ine', b, 0)), 784 785 (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)), 786 (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)), 787 (('fneu', ('b2f', 'a@1'), 0.0), a), 788 (('flt', 0.0, ('b2f', 'a@1')), a), 789 (('ieq', ('b2i', 'a@1'), 0), ('inot', a)), 790 (('ine', ('b2i', 'a@1'), 0), a), 791 (('ieq', 'a@1', False), ('inot', a)), 792 (('ieq', 'a@1', True), a), 793 (('ine', 'a@1', False), a), 794 (('ine', 'a@1', True), ('inot', a)), 795 796 (('fneu', ('u2f', a), 0.0), ('ine', a, 0)), 797 (('feq', ('u2f', a), 0.0), ('ieq', a, 0)), 798 (('fge', ('u2f', a), 0.0), True), 799 (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead? 800 (('flt', ('u2f', a), 0.0), False), 801 (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead? 802 (('fneu', ('i2f', a), 0.0), ('ine', a, 0)), 803 (('feq', ('i2f', a), 0.0), ('ieq', a, 0)), 804 (('fge', ('i2f', a), 0.0), ('ige', a, 0)), 805 (('fge', 0.0, ('i2f', a)), ('ige', 0, a)), 806 (('flt', ('i2f', a), 0.0), ('ilt', a, 0)), 807 (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)), 808 809 # 0.0 < fabs(a) 810 # fabs(a) > 0.0 811 # fabs(a) != 0.0 because fabs(a) must be >= 0 812 # a != 0.0 813 (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)), 814 815 # -fabs(a) < 0.0 816 # fabs(a) > 0.0 817 (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)), 818 819 # 0.0 >= fabs(a) 820 # 0.0 == fabs(a) because fabs(a) must be >= 0 821 # 0.0 == a 822 (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)), 823 824 # -fabs(a) >= 0.0 825 # 0.0 >= fabs(a) 826 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 827 828 # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a 829 # 830 # This should be NaN safe. 831 # 832 # NaN >= 0 && 1 >= NaN -> false && false -> false 833 # 834 # vs. 
835 # 836 # NaN == fsat(NaN) -> NaN == 0 -> false 837 (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'), 838 839 # Note: fmin(-a, -b) == -fmax(a, b) 840 (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), 841 (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))), 842 (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 843 (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), 844 845 # fmin(b2f(a), b) 846 # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b)) 847 # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b)) 848 # bcsel(a, fmin(1.0, b), fmin(0.0, b)) 849 # 850 # Since b is a constant, constant folding will eliminate the fmin and the 851 # fmax. If b is > 1.0, the bcsel will be replaced with a b2f. 852 (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))), 853 854 (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)), 855 856 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 857 (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)), 858 (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)), 859 (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)), 860 (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)), 861 (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)), 862 (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), 863 (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)), 864 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 865 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 866 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 867 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 868 (('bcsel', a, True, b), ('ior', a, b)), 869 (('bcsel', a, a, b), ('ior', a, b)), 870 (('bcsel', a, b, False), ('iand', a, b)), 871 (('bcsel', a, b, a), ('iand', a, b)), 872 (('~fmin', a, a), a), 873 (('~fmax', a, a), a), 874 (('imin', a, a), a), 875 (('imax', a, a), a), 876 (('umin', a, a), a), 877 (('umin', a, 0), 0), 878 (('umin', a, -1), a), 879 (('umax', a, a), a), 880 (('umax', a, 0), a), 881 (('umax', a, -1), -1), 882 (('fmax', ('fmax', a, b), b), ('fmax', a, b)), 883 (('umax', ('umax', a, b), b), ('umax', a, b)), 884 (('imax', ('imax', a, b), b), ('imax', a, b)), 885 (('fmin', ('fmin', a, b), b), ('fmin', a, b)), 886 (('umin', ('umin', a, b), b), ('umin', a, b)), 887 (('imin', ('imin', a, b), b), ('imin', a, b)), 888 (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)), 889 (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)), 890 (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)), 891 (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)), 892 (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)), 893 (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)), 894 (('fmin', ('fmax', 'a(is_finite)', b), a), ('fmul', 1.0, a)), 895 (('fmax', ('fmin', 'a(is_finite)', b), a), ('fmul', 1.0, a)), 896 (('umin', ('umax', a, b), a), a), 897 (('umax', ('umin', a, b), a), a), 898 (('imin', ('imax', a, b), a), a), 899 (('imax', ('imin', a, b), a), a), 900]) 901 902for N in [8, 16, 
32, 64]: 903 b2iN = 'b2i{0}'.format(N) 904 optimizations.extend([ 905 (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)), 906 (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)), 907 ]) 908 909for N in [16, 32, 64]: 910 b2fN = 'b2f{0}'.format(N) 911 optimizations.extend([ 912 (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)), 913 (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)), 914 ]) 915 916# Integer sizes 917for s in [8, 16, 32, 64]: 918 optimizations.extend([ 919 (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)), 920 921 # Simplify logic to detect sign of an integer. 922 (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)), 923 (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)), 924 (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)), 925 (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)), 926 (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 927 (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 928 (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)), 929 (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)), 930 (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 931 (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 932 (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)), 933 (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)), 934 ]) 935 936optimizations.extend([ 937 (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))), 938 (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))), 939 (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))), 940 (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))), 941 (('~fmin', a, ('fabs', a)), a), 942 (('imin', a, ('iabs', a)), a), 943 (('~fmax', a, ('fneg', ('fabs', a))), a), 944 (('imax', a, ('ineg', ('iabs', a))), a), 945 (('fmax', a, ('fabs', a)), ('fabs', a)), 946 (('imax', a, ('iabs', a)), ('iabs', a)), 947 (('fmax', a, ('fneg', a)), ('fabs', a)), 948 (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'), 949 (('~fmax', ('fabs', a), 0.0), ('fabs', a)), 950 (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), 951 # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while 952 # fsat(a) returns 0.0. 953 (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), 954 # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while 955 # fneg(fsat(fneg(a))) returns -0.0 on NaN. 956 (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 957 # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while 958 # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if 959 # SignedZeroInfNanPreserve is set, but we don't currently have any way of 960 # representing this in the optimizations other than the usual ~. 961 (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 962 # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark 963 # the new comparison precise to prevent it being changed to 'a != 0'. 
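   # Spelled out case by case (a check of the rule below, not a new rule):
   #   a < 0:   fsat(fsign(a)) = fsat(-1) = 0   and   b2f(0 < a) = 0
   #   a == 0:  fsat(fsign(a)) = fsat(0)  = 0   and   b2f(0 < a) = 0
   #   a > 0:   fsat(fsign(a)) = fsat(1)  = 1   and   b2f(0 < a) = 1
   #   a NaN:   fsat(fsign(a)) = fsat(0)  = 0   and   b2f(0 < a) = 0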
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat(nsz)', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # If a >= 0 ... 1 + a >= 1 ... so fsat(1 + a) = 1
   (('fsat', ('fadd', 1.0, 'a(is_ge_zero)')), 1.0),

   # Let constant folding do its job. This can have emergent behaviour.
   (('fneg', ('bcsel(is_used_once)', a, '#b', '#c')), ('bcsel', a, ('fneg', b), ('fneg', c))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists. Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c. Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
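   # For instance, in (a < NaN) || (a < c) the first comparison is always
   # false, so the whole expression is just a < c; and because fmax(NaN, c)
   # picks the non-NaN operand, a < fmax(NaN, c) is also a < c.  The same
   # reasoning applies to the fmin/fge variants below.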
1009 (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))), 1010 (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)), 1011 (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))), 1012 (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)), 1013 (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))), 1014 (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)), 1015 (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))), 1016 (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)), 1017 (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))), 1018 (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)), 1019 (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))), 1020 (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)), 1021 (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))), 1022 (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)), 1023 (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))), 1024 (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)), 1025 1026 (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))), 1027 (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)), 1028 (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))), 1029 (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)), 1030 (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))), 1031 (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)), 1032 (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))), 1033 (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)), 1034 (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))), 1035 (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)), 1036 (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))), 1037 (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)), 1038 (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))), 1039 (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)), 1040 (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))), 1041 (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)), 1042 1043 # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y 1044 # < 0.0, || a.y > 1.0 || ... These patterns rearrange and replace in a 1045 # single step. Doing just the replacement can lead to an infinite loop as 1046 # the pattern is repeatedly applied to the result of the previous 1047 # application of the pattern. 
1048 (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 1049 (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 1050 (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 1051 (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 1052 1053 # This is how SpvOpFOrdNotEqual might be implemented. If both values are 1054 # numbers, then it can be replaced with fneu. 1055 (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)), 1056 1057 # Other patterns may optimize the resulting iand tree further. 1058 (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)), 1059 ('iand', ('iand', a, b), ('iand', c, b))), 1060]) 1061 1062# Float sizes 1063for s in [16, 32, 64]: 1064 if s == 64: 1065 match_fsign_cond = "!options->lower_fsign & !(options->lower_doubles_options & nir_lower_dsign)" 1066 else: 1067 match_fsign_cond = "!options->lower_fsign" 1068 optimizations.extend([ 1069 # These derive from the previous patterns with the application of b < 0 <=> 1070 # 0 < -b. The transformation should be applied if either comparison is 1071 # used once as this ensures that the number of comparisons will not 1072 # increase. The sources to the ior and iand are not symmetric, so the 1073 # rules have to be duplicated to get this behavior. 1074 (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 1075 (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 1076 (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 1077 (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 1078 (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 1079 (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 1080 (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 1081 (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 1082 1083 (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)), 1084 (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)), 1085 (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)), 1086 (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)), 1087 1088 # The (i2f32, ...) part is an open-coded fsign. When that is combined 1089 # with the bcsel, it's basically copysign(1.0, a). There are some 1090 # behavior differences between this pattern and copysign w.r.t. ±0 and 1091 # NaN. 
copysign(x, y) blindly takes the sign bit from y and applies it 1092 # to x, regardless of whether either or both values are NaN. 1093 # 1094 # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0, 1095 # int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0 1096 # If a == ±0: bcsel(True, 1.0, ...) = 1.0, 1097 # int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1 1098 # 1099 # For all other values of 'a', the original and replacement behave as 1100 # copysign. 1101 # 1102 # Marking the replacement comparisons as precise prevents any future 1103 # optimizations from replacing either of the comparisons with the 1104 # logical-not of the other. 1105 # 1106 # Note: Use b2i32 in the replacement because some platforms that 1107 # support fp16 don't support int16. 1108 (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))), 1109 ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))), 1110 1111 (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))), 1112 1113 # The C spec says, "If the value of the integral part cannot be represented 1114 # by the integer type, the behavior is undefined." "Undefined" can mean 1115 # "the conversion doesn't happen at all." 1116 (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)), 1117 1118 # Ironically, mark these as imprecise because removing the conversions may 1119 # preserve more precision than doing the conversions (e.g., 1120 # uint(float(0x81818181u)) == 0x81818200). 1121 (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1122 (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1123 (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1124 (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1125 1126 (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond), 1127 (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond), 1128 1129 # float? -> float? -> floatS ==> float? -> floatS 1130 (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)), 1131 1132 # int? -> float? -> floatS ==> int? -> floatS 1133 (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)), 1134 (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)), 1135 1136 # float? -> float? -> intS ==> float? -> intS 1137 (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)), 1138 (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)), 1139 1140 # HLSL's sign function returns an integer 1141 (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)), 1142 ]) 1143 1144 for B in [32, 64]: 1145 if s < B: 1146 optimizations.extend([ 1147 # S = smaller, B = bigger 1148 # floatS -> floatB -> floatS ==> identity 1149 (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a), 1150 1151 # floatS -> floatB -> intB ==> floatS -> intB 1152 (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)), 1153 (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)), 1154 1155 # int? -> floatB -> floatS ==> int? 
-> floatS 1156 (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)), 1157 (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)), 1158 ]) 1159 1160for S in [1, 8, 16, 32]: 1161 for B in [8, 16, 32, 64]: 1162 if B <= S: 1163 continue 1164 optimizations.extend([ 1165 # intS -> intB -> intS ==> identity 1166 (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a), 1167 (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a), 1168 ]) 1169 1170 if B < 16: 1171 continue 1172 for C in [8, 16, 32, 64]: 1173 if C <= S: 1174 continue 1175 optimizations.extend([ 1176 # intS -> intC -> floatB ==> intS -> floatB 1177 (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)), 1178 (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)), 1179 ]) 1180 1181# mediump variants of the above 1182optimizations.extend([ 1183 # int32 -> float32 -> float16 ==> int32 -> float16 1184 (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)), 1185 (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)), 1186 1187 # float32 -> float16 -> int16 ==> float32 -> int16 1188 (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)), 1189 (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)), 1190 1191 # float32 -> int32 -> int16 ==> float32 -> int16 1192 (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)), 1193 (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)), 1194 1195 # int32 -> int16 -> float16 ==> int32 -> float16 1196 (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)), 1197 (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)), 1198]) 1199 1200# Clean up junk left from 8-bit integer to 16-bit integer lowering. 1201optimizations.extend([ 1202 # The u2u16(u2u8(X)) just masks off the upper 8 bits of X. This can be 1203 # accomplished by masking off the upper 8 bits of the immediate operand to 1204 # the iand instruction. Oftentimes, both patterns will end up being applied 1205 # to the same original expression tree.
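   # Illustrative example (hypothetical value): with X = 0x1234, u2u8(X) is
   # 0x34 and u2u16(u2u8(X)) is 0x0034, i.e. X & 0x00ff, so folding the 0xff
   # into the iand immediate below lets constant folding finish the cleanup.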
1206 (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))), 1207 (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))), 1208]) 1209 1210for op in ['iand', 'ior', 'ixor']: 1211 optimizations.extend([ 1212 (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))), 1213 (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))), 1214 1215 # Undistribute extract from a logic op 1216 ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)), 1217 ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)), 1218 ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)), 1219 ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)), 1220 1221 # Undistribute shifts from a logic op 1222 ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)), 1223 ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)), 1224 ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)), 1225 ]) 1226 1227# Integer sizes 1228for s in [8, 16, 32, 64]: 1229 amount_bits = int(math.log2(s)) 1230 1231 lower_umin = 'options->lower_umin' 1232 lower_umax = 'options->lower_umax' 1233 lower_imin = 'false' 1234 lower_imax = 'false' 1235 lower_ior = 'options->lower_bitops' 1236 if s == 64: 1237 lower_umin = '(options->lower_umin || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1238 lower_umax = '(options->lower_umax || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1239 lower_imin = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1240 lower_imax = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1241 lower_ior = '(options->lower_bitops || (options->lower_int64_options & nir_lower_logic64) != 0)' 1242 1243 optimizations.extend([ 1244 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), lower_umax + ' && !' + lower_ior), 1245 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), lower_umin + ' && !' + lower_ior), 1246 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!'+lower_umax), 1247 (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!'+lower_umin), 1248 (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!'+lower_umin), 1249 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!'+lower_umax), 1250 1251 (('bcsel', ('ult', 'b@{}'.format(s), a), b, a), ('umin', a, b), '!'+lower_umin), 1252 (('bcsel', ('ult', 'a@{}'.format(s), b), b, a), ('umax', a, b), '!'+lower_umax), 1253 (('bcsel', ('uge', 'a@{}'.format(s), b), b, a), ('umin', a, b), '!'+lower_umin), 1254 (('bcsel', ('uge', 'b@{}'.format(s), a), b, a), ('umax', a, b), '!'+lower_umax), 1255 (('bcsel', ('ilt', 'b@{}'.format(s), a), b, a), ('imin', a, b), '!'+lower_imin), 1256 (('bcsel', ('ilt', 'a@{}'.format(s), b), b, a), ('imax', a, b), '!'+lower_imax), 1257 (('bcsel', ('ige', 'a@{}'.format(s), b), b, a), ('imin', a, b), '!'+lower_imin), 1258 (('bcsel', ('ige', 'b@{}'.format(s), a), b, a), ('imax', a, b), '!'+lower_imax), 1259 1260 # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 
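   # Illustrative arithmetic for the comment above: b2i(True) = 1 and
   # ineg(1) = -1, whose two's-complement bit pattern is all ones (~0), which
   # is exactly NIR's True; for False the chain stays 0.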
1261 (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1262 1263 # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1264 (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1265 (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1266 (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1267 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), amount_bits - 1)), ('ushr', a, ('ishl', b, amount_bits - 1))), 1268 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 3), amount_bits - 2)), ('ushr', a, ('ishl', b, amount_bits - 2))), 1269 ]) 1270 1271optimizations.extend([ 1272 # Common pattern like 'if (i == 0 || i == 1 || ...)' 1273 (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1274 (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1275 (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1276 (('ior', a, ('ieq', a, False)), True), 1277 1278 (('uge', a, 1), ('ine', a, 0)), 1279 (('ult', a, 1), ('ieq', a, 0)), 1280 1281 (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1282 (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1283 1284 (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'), 1285 ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)), 1286 ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))), 1287 ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b))) 1288 ) 1289 ), 1290 1291 (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), 1292 ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'), 1293 1294 (('ior', ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)), 1295 (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)), 1296 1297 # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code. 1298 # The first part of the iand comes from the !__feq64_nonnan. 1299 # 1300 # The second pattern is a reformulation of the first based on the relation 1301 # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1302 # happens to be y == 0. 1303 (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1304 ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1305 (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1306 ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1307 1308 # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1309 # transformations occur before constant propagation and loop-unrolling. 1310 # 1311 # The flt versions are exact. If isnan(a), the original pattern is 1312 # trivially false, and the replacements are false too. 
If isnan(b): 1313 # 1314 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1315 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1316 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1317 (('~fge', a, ('fmin', b, a)), True), 1318 (('~fge', ('fmax', a, b), a), True), 1319 (('flt', a, ('fmin', b, a)), False), 1320 (('flt', ('fmax', a, b), a), False), 1321 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1322 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1323 1324 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1325 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1326 (('ige', a, ('imin', b, a)), True), 1327 (('ige', ('imax', a, b), a), True), 1328 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1329 (('ult', ('umin', a, b), a), ('ult', b, a)), 1330 (('uge', a, ('umin', b, a)), True), 1331 (('uge', ('umax', a, b), a), True), 1332 (('ilt', a, ('imin', b, a)), False), 1333 (('ilt', ('imax', a, b), a), False), 1334 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1335 (('ige', ('imin', a, b), a), ('ige', b, a)), 1336 (('ult', a, ('umin', b, a)), False), 1337 (('ult', ('umax', a, b), a), False), 1338 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1339 (('uge', ('umin', a, b), a), ('uge', b, a)), 1340 (('ult', a, ('iand', b, a)), False), 1341 (('ult', ('ior', a, b), a), False), 1342 (('uge', a, ('iand', b, a)), True), 1343 (('uge', ('ior', a, b), a), True), 1344 1345 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1346 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1347 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1348 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1349 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1350 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1351 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1352 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1353 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1354 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1355 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1356 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1357 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1358 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1359 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1360 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1361 1362 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1363 # negative. 
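   # Illustrative 8-bit example: ishr(0x80, 3) = 0xf0 (sign bit still set)
   # while ishr(0x7f, 3) = 0x0f (still non-negative), so the shifted value is
   # negative exactly when 'a' is, and the bcsel below is just iabs of the
   # shift.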
1364 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1365 ('iabs', ('ishr', a, b))), 1366 (('iabs', ('ishr', ('iabs', a), b)), ('ushr', ('iabs', a), b)), 1367 (('iabs', ('ushr', ('iabs', a), b)), ('ushr', ('iabs', a), b)), 1368 1369 (('fabs', ('slt', a, b)), ('slt', a, b)), 1370 (('fabs', ('sge', a, b)), ('sge', a, b)), 1371 (('fabs', ('seq', a, b)), ('seq', a, b)), 1372 (('fabs', ('sne', a, b)), ('sne', a, b)), 1373 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1374 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1375 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1376 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1377 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1378 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1379 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1380 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1381 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1382 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1383 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1384 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1385 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1386 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1387 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1388 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1389 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1390 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1391 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1392 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1393 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1394 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1395 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1396 (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'), 1397 (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'), 1398 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1399 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1400 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1401 (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1402 (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1403]) 1404 1405def vector_cmp(reduce_op, cmp_op, comps): 1406 if len(comps) == 1: 1407 return (cmp_op, 'a.' + comps[0], 'b.' 
+ comps[0]) 1408 else: 1409 mid = len(comps) // 2 1410 return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]), 1411 vector_cmp(reduce_op, cmp_op, comps[mid:])) 1412 1413for op in [ 1414 ('ball_iequal', 'ieq', 'iand'), 1415 ('ball_fequal', 'feq', 'iand'), 1416 ('bany_inequal', 'ine', 'ior'), 1417 ('bany_fnequal', 'fneu', 'ior'), 1418]: 1419 optimizations.extend([ 1420 ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'), 1421 ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'), 1422 ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'), 1423 ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'), 1424 ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'), 1425 ]) 1426 1427# D3D Boolean emulation 1428for s in [8, 16, 32, 64]: 1429 cond = 'true' 1430 if s == 64: 1431 cond = '!(options->lower_int64_options & nir_lower_conv64)' 1432 1433 optimizations.extend([ 1434 (('bcsel@{}'.format(s), a, -1, 0), ('ineg', ('b2i', 'a@1')), cond), 1435 (('bcsel@{}'.format(s), a, 0, -1), ('ineg', ('b2i', ('inot', a))), cond), 1436 (('bcsel@{}'.format(s), a, 1, 0), ('b2i', 'a@1'), cond), 1437 (('bcsel@{}'.format(s), a, 0, 1), ('b2i', ('inot', a)), cond), 1438 ]) 1439 1440optimizations.extend([ 1441 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1442 ('ineg', ('b2i', ('iand', a, b)))), 1443 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1444 ('ineg', ('b2i', ('ior', a, b)))), 1445 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1446 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1447 (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1448 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1449 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1450 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1451 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1452]) 1453 1454optimizations.extend([ 1455 (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1456 (('feq', ('sne', a, b), 1.0), ('fneu', a, b)), 1457 (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1458 (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1459 (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1460 (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1461 (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1462 (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1463 (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1464 (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1465 (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1466 (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1467 (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1468 (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1469 (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1470 (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1471 1472 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1473 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1474 # Emulating booleans 1475 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1476 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1477 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1478 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1479 (('ffma', ('b2f', 'a@1'), ('b2f', 'b@1'), c), ('fadd', ('b2f', ('iand', a, b)), c)), 1480 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1481 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1482 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1483 (('flt', ('fneg', ('b2f', 'a@1')), 
0), a), # Generated by TGSI KILL_IF. 1484 # Comparison with the same args. Note that these are only done for the 1485 # float versions when the source must be a number. Generally, NaN cmp NaN 1486 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1487 # is false, and, for any number X, X < X is also false. 1488 (('ilt', a, a), False), 1489 (('ige', a, a), True), 1490 (('ieq', a, a), True), 1491 (('ine', a, a), False), 1492 (('ult', a, a), False), 1493 (('uge', a, a), True), 1494 (('flt', a, a), False), 1495 (('fge', 'a(is_a_number)', a), True), 1496 (('feq', 'a(is_a_number)', a), True), 1497 (('fneu', 'a(is_a_number)', a), False), 1498 # Logical and bit operations 1499 (('iand', a, a), a), 1500 (('iand', a, 0), 0), 1501 (('iand', a, -1), a), 1502 (('iand', a, ('inot', a)), 0), 1503 (('ior', a, a), a), 1504 (('ior', a, 0), a), 1505 (('ior', a, -1), -1), 1506 (('ior', a, ('inot', a)), -1), 1507 (('ixor', a, a), 0), 1508 (('ixor', a, 0), a), 1509 (('ixor', a, ('ixor', a, b)), b), 1510 (('ixor', a, -1), ('inot', a)), 1511 (('inot', ('inot', a)), a), 1512 (('ior', ('iand', a, b), b), b), 1513 (('ior', ('ior', a, b), b), ('ior', a, b)), 1514 (('iand', ('ior', a, b), b), b), 1515 (('iand', ('iand', a, b), b), ('iand', a, b)), 1516 1517 # It is common for sequences of (x & 1) to occur in large trees. Replacing 1518 # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "& 1519 # 1" to eventually bubble up to the top of the tree. 1520 (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), 1521 ('iand', a, ('iand', b, c))), 1522 1523 (('iand@64', a, '#b(is_lower_half_zero)'), 1524 ('pack_64_2x32_split', 0, 1525 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1526 '!options->lower_pack_64_2x32_split'), 1527 (('iand@64', a, '#b(is_upper_half_zero)'), 1528 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1529 0), 1530 '!options->lower_pack_64_2x32_split'), 1531 (('iand@64', a, '#b(is_lower_half_negative_one)'), 1532 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1533 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1534 '!options->lower_pack_64_2x32_split'), 1535 (('iand@64', a, '#b(is_upper_half_negative_one)'), 1536 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1537 ('unpack_64_2x32_split_y', a)), 1538 '!options->lower_pack_64_2x32_split'), 1539 1540 (('ior@64', a, '#b(is_lower_half_zero)'), 1541 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1542 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1543 '!options->lower_pack_64_2x32_split'), 1544 (('ior@64', a, '#b(is_upper_half_zero)'), 1545 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1546 ('unpack_64_2x32_split_y', a)), 1547 '!options->lower_pack_64_2x32_split'), 1548 (('ior@64', a, '#b(is_lower_half_negative_one)'), 1549 ('pack_64_2x32_split', -1, 1550 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1551 '!options->lower_pack_64_2x32_split'), 1552 (('ior@64', a, '#b(is_upper_half_negative_one)'), 1553 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1554 -1), 1555 '!options->lower_pack_64_2x32_split'), 1556 1557 (('ixor@64', a, '#b(is_lower_half_zero)'), 1558 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1559 ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1560 
'!options->lower_pack_64_2x32_split'), 1561 (('ixor@64', a, '#b(is_upper_half_zero)'), 1562 ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1563 ('unpack_64_2x32_split_y', a)), 1564 '!options->lower_pack_64_2x32_split'), 1565 1566 # DeMorgan's Laws 1567 (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))), 1568 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1569 # Shift optimizations 1570 (('ishl', 0, a), 0), 1571 (('ishl', a, 0), a), 1572 (('ishr', 0, a), 0), 1573 (('ishr', -1, a), -1), 1574 (('ishr', a, 0), a), 1575 (('ushr', 0, a), 0), 1576 (('ushr', a, 0), a), 1577 (('bcsel', ('ieq', b, 0), a, ('ushr', a, b)), ('ushr', a, b)), 1578 (('bcsel', ('ieq', b, 0), a, ('ishr', a, b)), ('ishr', a, b)), 1579 (('bcsel', ('ieq', b, 0), a, ('ishl', a, b)), ('ishl', a, b)), 1580 (('bcsel', ('ine', b, 0), ('ushr', a, b), a), ('ushr', a, b)), 1581 (('bcsel', ('ine', b, 0), ('ishr', a, b), a), ('ishr', a, b)), 1582 (('bcsel', ('ine', b, 0), ('ishl', a, b), a), ('ishl', a, b)), 1583 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'), 1584 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'), 1585 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'), 1586 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'), 1587 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'), 1588 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'), 1589 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'), 1590 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'), 1591 (('urol@8', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 8, b))), '!options->has_rotate8'), 1592 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'), 1593 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'), 1594 (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))), 1595 (('uror@8', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 8, b))), '!options->has_rotate8'), 1596 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'), 1597 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'), 1598 (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))), 1599 1600 (('bitfield_select', 0xff000000, ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1601 (('bitfield_select', 0xffff0000, ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1602 (('bitfield_select', 0xffffff00, ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1603 (('ior', ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1604 (('ior', ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1605 (('ior', ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1606 (('bcsel', ('ieq', c, 0), a, ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c))), 
('shfr', b, a, c), 'options->has_shfr32'), 1607 (('bcsel', ('ine', c, 0), ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c)), a), ('shfr', b, a, c), 'options->has_shfr32'), 1608 (('ior', ('ishl', 'a@32', ('iadd', 32, ('ineg', b))), ('ushr@32', a, b)), ('shfr', a, a, b), 'options->has_shfr32 && !options->has_rotate32'), 1609 1610 # bfi(X, a, b) = (b & ~X) | (a & X) 1611 # If X = ~0: (b & 0) | (a & 0xffffffff) = a 1612 # If X = 0: (b & 0xffffffff) | (a & 0) = b 1613 (('bfi', 0xffffffff, a, b), a), 1614 (('bfi', 0x00000000, a, b), b), 1615 1616 # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the 1617 # bfi is either b or c. 1618 (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)), 1619 1620 # bfi(a, 0, 0) = ((0 << find_lsb(a)) & a) | (0 & ~a) 1621 # = 0 1622 (('bfi', a, 0, 0), 0), 1623 1624 # bfi(a, b, b) = ((b << find_lsb(a)) & a) | (b & ~a) 1625 # = (a & b) | (b & ~a) If a is odd, find_lsb(a) == 0 1626 # = b 1627 (('bfi', '#a(is_odd)', b, b), b), 1628 1629 # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a) 1630 # = (a & a) | (b & ~a) If a is odd, find_lsb(a) == 0 1631 # = a | (b & ~a) 1632 # = a | b 1633 (('bfi', '#a(is_odd)', a, b), ('ior', a, b)), 1634 1635 # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a) 1636 # = ((b << find_lsb(a)) & a) 1637 # = (b & a) If a is odd, find_lsb(a) == 0 1638 (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)), 1639 1640 # Because 'a' is a positive power of two, the result of the bfi is either 0 1641 # or 'a' depending on whether or not 'b' is odd. Use 'b&1' for the zero 1642 # value to help platforms that can't have two constants in a bcsel. 1643 (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1644 ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))), 1645 (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1646 ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))), 1647 1648 # Exponential/logarithmic identities 1649 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1650 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1651 # 32-bit fpow should use fmulz to fix https://gitlab.freedesktop.org/mesa/mesa/-/issues/11464 (includes apitrace) 1652 (('fpow@32', a, b), ('fexp2', ('fmulz', ('flog2', a), b)), 'options->lower_fpow && ' + has_fmulz), # a^b = 2^(lg2(a)*b) 1653 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1654 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1655 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1656 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d 1657 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1658 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1659 (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)), 1660 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1661 (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)), 1662 (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))), 1663 (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))), 1664 (('~fpow', a, 1.0), a), 1665 (('~fpow', a, 2.0), ('fmul', a, a)), 1666 (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)), 1667 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
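   # Illustrative cost note for the expansions above: after CSE the a*a term
   # is computed only once, so e.g. a^4 as (a*a)*(a*a) costs two multiplies,
   # which is generally cheaper than the fexp2/flog2 chain it replaces.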
1668 (('~fpow', 2.0, a), ('fexp2', a)), 1669 (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1670 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1671 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1672 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1673 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1674 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1675 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1676 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1677 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1678 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1679 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1680 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1681 (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1682 # Division and reciprocal 1683 (('~fdiv', 1.0, a), ('frcp', a)), 1684 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1685 (('~frcp', ('frcp', a)), a), 1686 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1687 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1688 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1689 # Trig 1690 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1691 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1692 # Boolean simplifications 1693 (('ieq', a, True), a), 1694 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1695 (('ine', a, False), a), 1696 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1697 (('bcsel', a, True, False), a), 1698 (('bcsel', a, False, True), ('inot', a)), 1699 (('bcsel', True, b, c), b), 1700 (('bcsel', False, b, c), c), 1701 1702 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1703 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1704 (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1705 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1706 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1707 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1708 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1709 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1710 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1711 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1712 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1713 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1714 1715 (('bcsel', a, b, b), b), 1716 (('~fcsel', a, b, b), b), 1717 1718 # With D3D booleans, imax is AND and umax is OR 1719 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1720 ('ineg', ('b2i', ('iand', a, b)))), 1721 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1722 ('ineg', ('b2i', ('ior', a, b)))), 1723 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1724 ('ineg', ('b2i', ('ior', a, b)))), 1725 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1726 ('ineg', ('b2i', ('iand', a, b)))), 1727 (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1728 (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1729 1730 # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op. 
1731 (('iand', ('b2i', a), 1), ('b2i', a)), 1732 1733 (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))), 1734 (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))), 1735 1736 # Conversions 1737 (('f2i', ('ftrunc', a)), ('f2i', a)), 1738 (('f2u', ('ftrunc', a)), ('f2u', a)), 1739 1740 # Conversions from 16 bits to 32 bits and back can always be removed 1741 (('f2fmp', ('f2f32', 'a@16')), a), 1742 (('i2imp', ('i2i32', 'a@16')), a), 1743 (('i2imp', ('u2u32', 'a@16')), a), 1744 1745 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1746 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1747 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1748 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1749 1750 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1751 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1752 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1753 1754 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1755 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1756 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1757 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1758 1759 # Conversions to 16 bits would be lossy so they should only be removed if 1760 # the instruction was generated by the precision lowering pass. 1761 (('f2f32', ('f2fmp', 'a@32')), a), 1762 (('i2i32', ('i2imp', 'a@32')), a), 1763 (('u2u32', ('i2imp', 'a@32')), a), 1764 1765 # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32 1766 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1767 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1768 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1769 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1770 1771 # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32 1772 (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)), 1773 (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)), 1774 (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)), 1775 1776 (('ffloor', 'a(is_integral)'), a), 1777 (('fceil', 'a(is_integral)'), a), 1778 (('ftrunc', 'a(is_integral)'), a), 1779 (('fround_even', 'a(is_integral)'), a), 1780 1781 # fract(x) = x - floor(x), so fract(NaN) = NaN 1782 (('~ffract', 'a(is_integral)'), 0.0), 1783 (('fabs', 'a(is_not_negative)'), a), 1784 (('iabs', 'a(is_not_negative)'), a), 1785 (('fsat', 'a(is_not_positive)'), 0.0), 1786 1787 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1788 1789 # The result of the multiply must be in [-1, 0], so the result of the ffma 1790 # must be in [0, 1]. 1791 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1792 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1793 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1794 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1795 1796 (('fneu', 'a(is_not_zero)', 0.0), True), 1797 (('feq', 'a(is_not_zero)', 0.0), False), 1798 1799 # In this chart, + means value > 0 and - means value < 0. 1800 # 1801 # + >= + -> unknown 0 >= + -> false - >= + -> false 1802 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1803 # + >= - -> true 0 >= - -> true - >= - -> unknown 1804 # 1805 # Using grouping conceptually similar to a Karnaugh map... 
1806 # 1807 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1808 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1809 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1810 # 1811 # The flt / ilt cases just invert the expected result. 1812 # 1813 # The results expecting true, must be marked imprecise. The results 1814 # expecting false are fine because NaN compared >= or < anything is false. 1815 1816 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1817 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1818 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1819 1820 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1821 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1822 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1823 1824 (('ine', 'a(is_not_zero)', 0), True), 1825 (('ieq', 'a(is_not_zero)', 0), False), 1826 1827 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1828 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1829 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1830 1831 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1832 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1833 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1834 1835 (('ult', 0, 'a(is_gt_zero)'), True), 1836 (('ult', a, 0), False), 1837]) 1838 1839# Packing and then unpacking does nothing 1840for pack, bits, compbits in [('pack_64_2x32', 64, 32), ('pack_32_2x16', 32, 16)]: 1841 unpack = 'un' + pack 1842 optimizations += [ 1843 ((unpack + '_split_x', (pack + '_split', a, b)), a), 1844 ((unpack + '_split_y', (pack + '_split', a, b)), b), 1845 ((unpack + '_split_x', (pack, a)), 'a.x'), 1846 ((unpack + '_split_y', (pack, a)), 'a.y'), 1847 ((unpack + '_split_x', ('u2u' + str(bits), 'a@' + str(compbits))), a), 1848 ((unpack + '_split_x', ('i2i' + str(bits), 'a@' + str(compbits))), a), 1849 ((unpack + '_split_y', ('i2i' + str(bits) + '(is_used_once)', 'a@' + str(compbits))), ('ishr', a, compbits - 1)), 1850 ((unpack, (pack + '_split', a, b)), ('vec2', a, b)), 1851 ((unpack, (pack, a)), a), 1852 ((pack + '_split', (unpack + '_split_x', a), (unpack + '_split_y', a)), a), 1853 ((pack + '_split', (unpack, a), (unpack + '.y', a)), a), 1854 ((pack, ('vec2', (unpack + '_split_x', a), (unpack + '_split_y', a))), a), 1855 ((pack, (unpack, a)), a), 1856 ] 1857 1858optimizations.extend([ 1859 (('unpack_64_2x32_split_y', ('u2u64', 'a@1')), 0), 1860 (('unpack_64_2x32_split_y', ('u2u64', 'a@8')), 0), 1861 (('unpack_64_2x32_split_y', ('u2u64', 'a@16')), 0), 1862 (('unpack_64_2x32_split_y', ('u2u64', 'a@32')), 0), # Don't do that for u64 -> u64 1863 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1864 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1865 1866 (('unpack_64_4x16', ('pack_64_4x16', a)), a), 1867 (('pack_64_4x16', ('unpack_64_4x16', a)), a), 1868 (('unpack_32_4x8', ('pack_32_4x8', a)), a), 1869 (('pack_32_4x8', ('unpack_32_4x8', a)), a), 1870 1871 (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)), 1872 (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)), 1873 1874 (('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)), 1875 ('pack_64_4x16', ('vec4', a, b, c, d)), 
'!options->lower_pack_64_4x16'), 1876 (('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), 1877 ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'), 1878 (('pack_64_2x32', ('vec2', ('pack_32_2x16', ('vec2', a, b)), ('pack_32_2x16', ('vec2', c, d)))), 1879 ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'), 1880 1881 # Comparing two halves of an unpack separately. While this optimization 1882 # should be correct for non-constant values, it's less obvious that it's 1883 # useful in that case. For constant values, the pack will fold and we're 1884 # guaranteed to reduce the whole tree to one instruction. 1885 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1886 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1887 ('ieq', a, ('pack_32_2x16_split', b, c))), 1888 1889 # Byte extraction 1890 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1891 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1892 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1893 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1894 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1895 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1896 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1897 (('ishr', ('iand', 'a@32', 0x0000ff00), 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1898 (('ishr', ('iand', 'a@64', 0x0000ff00), 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1899 (('ishr', ('iand', a, 0x00ff0000), 16), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1900 1901 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1902 # storage buffer. 1903 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1904 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1905 1906 # Common pattern after lowering 8-bit integers to 16-bit. 1907 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1908 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1909 1910 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1911 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1912 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1913 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1914 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1915 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1916 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1917 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1918 1919 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1920 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1921 1922 # The extract_X8(a & 0xff) patterns aren't included because the iand will 1923 # already be converted to extract_u8. 
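   # Illustrative example for the masked extracts below (hypothetical value):
   # with a = 0x12345678, (a & 0x0000ff00) = 0x00005600 and byte 1 of either
   # value is 0x56, so the mask contributes nothing and can be dropped.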
1924 (('extract_i8', ('iand', a, 0x0000ff00), 1), ('extract_i8', a, 1)), 1925 (('extract_i8', ('iand', a, 0x00ff0000), 2), ('extract_i8', a, 2)), 1926 (('extract_i8', ('iand', a, 0xff000000), 3), ('extract_i8', a, 3)), 1927 1928 (('extract_u8', ('iand', a, 0x0000ff00), 1), ('extract_u8', a, 1)), 1929 (('extract_u8', ('iand', a, 0x00ff0000), 2), ('extract_u8', a, 2)), 1930 (('extract_u8', ('iand', a, 0xff000000), 3), ('extract_u8', a, 3)), 1931 1932 (('iand', ('extract_u8', a, 0), '#b'), ('iand', a, ('iand', b, 0x00ff))), 1933 (('iand', ('extract_u16', a, 0), '#b'), ('iand', a, ('iand', b, 0xffff))), 1934 1935 (('ieq', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1936 (('ine', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1937 (('ieq', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1938 (('ine', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1939 1940 # Word extraction 1941 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1942 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1943 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1944 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1945 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1946 1947 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1948 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1949 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1950 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1951 1952 # Packing a u8vec4 to write to an SSBO. 1953 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1954 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1955 1956 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1957 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1958 1959 # The extract_X16(a & 0xff) patterns aren't included because the iand will 1960 # already be converted to extract_u8. 
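   # Illustrative check for the pair below (hypothetical value): with
   # a = 0x12345678, (a & 0x00ff0000) = 0x00340000, so word 1 of the masked
   # value is 0x0034.  Its sign bit can never be set, which is why even the
   # signed extract_i16 form can be replaced by extract_u8(a, 2).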
1961 (('extract_i16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), # extract_u8 is correct 1962 (('extract_u16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1963 1964 # Lower pack/unpack 1965 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1966 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'), 1967 (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'), 1968 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1969 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1970 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1971 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1972 1973 (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1974 (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1975 1976 # Useless masking before unpacking 1977 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1978 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1979 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1980 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1981 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1982 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1983 1984 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1985 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1986 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1987 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1988 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1989 1990 # Optimize half packing 1991 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1992 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1993 1994 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1995 ('pack_half_2x16', ('vec2', a, b))), 1996 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1997 ('pack_half_2x16', ('vec2', a, b))), 1998 1999 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 2000 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 2001 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 2002 2003 (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)), 2004 (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)), 2005 (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)), 2006 2007 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, 
b)), 2008 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 2009 2010 (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 2011 (('ior', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 2012 2013 (('pack_uint_2x16', ('vec2', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', b, 0))), ('pack_half_2x16_rtz_split', a, b)), 2014 2015 (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)), 2016 ('pack_half_2x16_split', c, a)), 2017 2018 # The important part here is that ~0xf & 0xfffffffc = ~0xf. 2019 (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc), 2020 ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)), 2021 (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc), 2022 ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)), 2023 2024 # 0x0f << 3 == 0x78, so that's already the maximum possible value. 2025 (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)), 2026 2027 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 2028 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 2029 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 2030 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 2031 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 2032 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 2033 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 2034 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 2035 2036 # Reduce intermediate precision with int64. 2037 (('u2u32', ('iadd(is_used_once)', 'a@64', b)), 2038 ('iadd', ('u2u32', a), ('u2u32', b))), 2039 2040 (('u2u32', ('imul(is_used_once)', 'a@64', b)), 2041 ('imul', ('u2u32', a), ('u2u32', b))), 2042 2043 (('u2f32', ('u2u64', 'a@32')), ('u2f32', a)), 2044 2045 # UINT32_MAX < a just checks the high half of a 64-bit value. This occurs 2046 # when lowering convert_uint_sat(ulong). Although the replacement is more 2047 # instructions, it replaces a 64-bit instruction with a 32-bit instruction 2048 # and a move that will likely be coalesced. 2049 (('ult', 0xffffffff, 'a@64'), ('ine', ('unpack_64_2x32_split_y', a), 0)), 2050 2051 # Redundant trip through 8-bit 2052 (('i2i16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)), 2053 (('u2u16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)), 2054 2055 # Reduce 16-bit integers to 1-bit booleans, hit with OpenCL. In turn, this 2056 # lets iand(b2i1(...), 1) get simplified. Backends can usually fuse iand/inot 2057 # so this should be no worse when it isn't strictly better. 
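   # Illustrative case check for the first pattern below: when 'a' is true
   # the bcsel selects 0 and iand(inot(a), b) is false, so b2i16 gives 0;
   # when 'a' is false both forms reduce to b2i16(b).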
2058 (('bcsel', a, 0, ('b2i16', 'b@1')), ('b2i16', ('iand', ('inot', a), b))), 2059 (('bcsel', a, ('b2i16', 'b@1'), ('b2i16', 'c@1')), ('b2i16', ('bcsel', a, b, c))), 2060 2061 # Lowered pack followed by lowered unpack, for the high bits 2062 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@8')), 32)), ('u2u32', a)), 2063 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@16')), 32)), ('u2u32', a)), 2064 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@32')), 32)), ('u2u32', a)), 2065 (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@8')), 16)), ('u2u16', a)), 2066 (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@16')), 16)), ('u2u16', a)), 2067]) 2068 2069# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 2070# patterns like those below. 2071for op in ('ushr', 'ishr'): 2072 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 2073 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 2074 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 2075 2076optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 2077 2078# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 2079# patterns like those below. 2080for op in ('extract_u8', 'extract_i8'): 2081 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 2082 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 2083 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 2084 2085for op, repl in [('ieq', 'ieq'), ('ine', 'ine'), 2086 ('ult', 'ult'), ('ilt', 'ult'), 2087 ('uge', 'uge'), ('ige', 'uge')]: 2088 optimizations.extend([ 2089 ((op, ('pack_64_2x32_split', a, 0), ('pack_64_2x32_split', b, 0)), (repl, a, b)), 2090 ((op, ('pack_64_2x32_split', a, 0), '#b(is_upper_half_zero)'), (repl, a, ('unpack_64_2x32_split_x', b))), 2091 ((op, '#a(is_upper_half_zero)', ('pack_64_2x32_split', b, 0)), (repl, ('unpack_64_2x32_split_x', a), b)), 2092 2093 ((op, ('pack_64_2x32_split', 0, a), ('pack_64_2x32_split', 0, b)), (op, a, b)), 2094 ((op, ('pack_64_2x32_split', 0, a), '#b(is_lower_half_zero)'), (op, a, ('unpack_64_2x32_split_y', b))), 2095 ((op, '#a(is_lower_half_zero)', ('pack_64_2x32_split', 0, b)), (op, ('unpack_64_2x32_split_y', a), b)), 2096 ]) 2097 2098optimizations.extend([ 2099 # Subtracts 2100 (('ussub_4x8_vc4', a, 0), a), 2101 (('ussub_4x8_vc4', a, ~0), 0), 2102 # Lower all Subtractions first - they can get recombined later 2103 (('fsub', a, b), ('fadd', a, ('fneg', b))), 2104 (('isub', a, b), ('iadd', a, ('ineg', b))), 2105 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 2106 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 
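   # For example, uabs_isub(INT32_MIN, INT32_MAX): the wrapping isub gives 1,
   # and since a < b it is negated to 0xffffffff = 2^32 - 1, which is exactly
   # |a - b| when the result is read as unsigned.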
2107 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 2108 (('bitz', a, b), ('inot', ('bitnz', a, b))), 2109 2110 # Propagate negation up multiplication chains 2111 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 2112 (('fmulz(is_used_by_non_fsat,nsz)', ('fneg', a), b), ('fneg', ('fmulz', a, b))), 2113 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 2114 (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 2115 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 2116 2117 # Propagate constants up multiplication chains 2118 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 2119 (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 2120 (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 2121 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 2122 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 2123 (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 2124 (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 2125 # Prefer moving out a multiplication for more MAD/FMA-friendly code 2126 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 2127 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 2128 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 2129 (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 2130 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 2131 2132 # Reassociate constants in add/mul chains so they can be folded together. 2133 # For now, we mostly only handle cases where the constants are separated by 2134 # a single non-constant. We could do better eventually. 
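   # For example, fmul(2.0, fmul(x, 4.0)) matches with a = 2.0, b = x, c = 4.0
   # and becomes fmul(fmul(2.0, 4.0), x), which constant folding then reduces
   # to fmul(8.0, x).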
2135 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 2136 (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 2137 (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 2138 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 2139 (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 2140 (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 2141 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 2142 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 2143 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 2144 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 2145 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2146 (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 2147 (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2148 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 2149 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 2150 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 2151 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 2152 (('ior', ('iand', a, '#c'), ('ior', b, ('iand', a, '#d'))), ('ior', b, ('iand', a, ('ior', c, d)))), 2153 2154 # Reassociate add chains for more MAD/FMA-friendly code 2155 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 2156 2157 # Drop mul-div by the same value when there's no wrapping. 2158 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 2159 2160 # By definition... 
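   # find_lsb and the *find_msb opcodes already return -1 in exactly the cases
   # where the bcsels below would substitute -1 (a == 0, or additionally
   # a == -1 for ifind_msb), so the selects are redundant.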
2161 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 2162 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2163 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2164 (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2165 (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2166 2167 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 2168 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2169 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2170 (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2171 (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2172 2173 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 2174 (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2175 2176 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2177 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2178 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2179 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2180 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2181 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2182 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2183 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2184 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2185 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2186 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2187 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2188 2189 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'), 2190 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2191 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2192 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2193 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', 
('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'), 2194 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2195 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2196 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2197 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2198 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2199 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2200 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2201 2202 # Clear the LSB 2203 (('iand', a, ('inot', ('ishl', 1, ('find_lsb', a)))), ('iand', a, ('inot', ('ineg', a)))), 2204 2205 # This is safe. Both ufind_msb_rev and bitfield_reverse can only have 2206 # 32-bit sources, so the transformation can only generate correct NIR. 2207 (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2208 (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'), 2209 2210 (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))), 2211 (('ifind_msb', ('extract_u8', a, b)), ('ufind_msb', ('extract_u8', a, b))), 2212 (('ifind_msb', ('extract_u16', a, b)), ('ufind_msb', ('extract_u16', a, b))), 2213 (('ifind_msb', ('imax', a, 1)), ('ufind_msb', ('imax', a, 1))), 2214 2215 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2216 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2217 (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2218 (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2219 (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)), 2220 (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)), 2221 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 2222 2223 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 2224 (('bcsel', a, ('bcsel(is_used_once)', b, d, c), d), ('bcsel', ('iand', a, ('inot', b)), c, d)), 2225 (('bcsel', a, b, ('bcsel(is_used_once)', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 2226 (('bcsel', a, b, ('bcsel(is_used_once)', c, d, b)), ('bcsel', ('iand', c, ('inot', a)), d, b)), 2227 2228 # Misc. 
lowering 2229 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 2230 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 2231 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 2232 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 2233 2234 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2235 ('bcsel', ('ult', 31, 'bits'), 'insert', 2236 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 2237 'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'), 2238 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2239 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2240 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2241 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2242 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2243 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2244 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2245 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2246 2247 (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 2248 (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 2249 2250 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 2251 'options->lower_uadd_sat || (options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64)) != 0'), 2252 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 2253 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 2254 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'), 2255 2256 # int64_t sum = a + b; 2257 # 2258 # if (a < 0 && b < 0 && a < sum) 2259 # sum = INT64_MIN; 2260 # } else if (a >= 0 && b >= 0 && sum < a) 2261 # sum = INT64_MAX; 2262 # } 2263 # 2264 # A couple optimizations are applied. 2265 # 2266 # 1. a < sum => sum >= 0. This replacement works because it is known that 2267 # a < 0 and b < 0, so sum should also be < 0 unless there was 2268 # underflow. 2269 # 2270 # 2. sum < a => sum < 0. This replacement works because it is known that 2271 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 2272 # overflow. 2273 # 2274 # 3. Invert the second if-condition and swap the order of parameters for 2275 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 2276 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 2277 # 2278 # On Intel Gen11, this saves ~11 instructions. 
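   # For example, with a = b = -2^62 the wrapped sum -2^63 is still negative,
   # so neither saturating condition fires and the plain sum is kept.  With
   # a = b = INT64_MIN the sum wraps to 0 (>= 0 with both inputs negative), so
   # INT64_MIN is returned, and with a = b = 2^62 the sum wraps to -2^63 (< 0
   # with both inputs non-negative), so INT64_MAX is returned.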
2279 (('iadd_sat@64', a, b), ('bcsel', 2280 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2281 0x8000000000000000, 2282 ('bcsel', 2283 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2284 ('iadd', a, b), 2285 0x7fffffffffffffff)), 2286 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2287 2288 # int64_t sum = a - b; 2289 # 2290 # if (a < 0 && b >= 0 && a < sum) 2291 # sum = INT64_MIN; 2292 # } else if (a >= 0 && b < 0 && a >= sum) 2293 # sum = INT64_MAX; 2294 # } 2295 # 2296 # Optimizations similar to the iadd_sat case are applied here. 2297 (('isub_sat@64', a, b), ('bcsel', 2298 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2299 0x8000000000000000, 2300 ('bcsel', 2301 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2302 ('isub', a, b), 2303 0x7fffffffffffffff)), 2304 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2305 2306 # These are done here instead of in the backend because the int64 lowering 2307 # pass will make a mess of the patterns. The first patterns are 2308 # conditioned on nir_lower_minmax64 because it was not clear that it was 2309 # always an improvement on platforms that have real int64 support. No 2310 # shaders in shader-db hit this, so it was hard to say one way or the 2311 # other. 2312 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2313 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2314 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2315 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2316 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2317 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2318 2319 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2320 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2321 # 0u < uint(a) <=> uint(a) != 0u 2322 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2323 2324 # Alternative lowering that doesn't rely on bfi. 2325 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2326 ('bcsel', ('ult', 31, 'bits'), 2327 'insert', 2328 (('ior', 2329 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 2330 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 2331 'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'), 2332 2333 # Alternative lowering that uses bitfield_select. 
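   # bitfield_select(mask, x, y) computes (mask & x) | (~mask & y) and
   # bfm(bits, offset) builds a run of 'bits' ones starting at 'offset', so
   # this keeps 'base' outside the field and takes the shifted 'insert' inside
   # it; the outer bcsel covers bits > 31, where bfm cannot produce the needed
   # all-ones mask.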
2334 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2335 ('bcsel', ('ult', 31, 'bits'), 'insert', 2336 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 2337 'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'), 2338 2339 (('ibitfield_extract', 'value', 'offset', 'bits'), 2340 ('bcsel', ('ult', 31, 'bits'), 'value', 2341 ('ibfe', 'value', 'offset', 'bits')), 2342 'options->lower_bitfield_extract && options->has_bfe'), 2343 2344 (('ubitfield_extract', 'value', 'offset', 'bits'), 2345 ('bcsel', ('ult', 31, 'bits'), 'value', 2346 ('ubfe', 'value', 'offset', 'bits')), 2347 'options->lower_bitfield_extract && options->has_bfe'), 2348 2349 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 2350 (('bitfield_select', a, b, 0), ('iand', a, b)), 2351 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 2352 2353 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 2354 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 2355 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 2356 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 2357 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 2358 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 2359 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 2360 2361 # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such 2362 (('ult', a, ('umin', ('iand', a, b), c)), False), 2363 (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False), 2364 (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2365 ('ubfe', 'value', 'offset', 'width')), 2366 (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2367 ('ibfe', 'value', 'offset', 'width')), 2368 (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'), 2369 ('bfm', 'width', 'offset')), 2370 2371 # open-coded BFM 2372 (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'), 2373 (('ishl', ('bfm', a, 0), b), ('bfm', a, b)), 2374 2375 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2376 # 2377 # If bits is zero, the result will be zero. 2378 # 2379 # These patterns prevent other patterns from generating invalid results 2380 # when count is zero. 
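   # ubfe/ibfe with bits == 0 extract an empty field, so per the quoted spec
   # text the only valid result is 0 regardless of the value and offset
   # operands.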
2381 (('ubfe', a, b, 0), 0), 2382 (('ibfe', a, b, 0), 0), 2383 2384 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 2385 2386 (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)), 2387 (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 2388 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2389 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2390 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2391 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2392 2393 (('ibitfield_extract', 'value', 'offset', 'bits'), 2394 ('bcsel', ('ieq', 0, 'bits'), 2395 0, 2396 ('ishr', 2397 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 2398 ('isub', 32, 'bits'))), 2399 'options->lower_bitfield_extract && !options->has_bfe'), 2400 2401 (('ubitfield_extract', 'value', 'offset', 'bits'), 2402 ('iand', 2403 ('ushr', 'value', 'offset'), 2404 ('bcsel', ('ieq', 'bits', 32), 2405 0xffffffff, 2406 ('isub', ('ishl', 1, 'bits'), 1))), 2407 'options->lower_bitfield_extract && !options->has_bfe'), 2408 2409 (('ifind_msb', 'value'), 2410 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 2411 'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'), 2412 2413 (('ifind_msb', 'value'), 2414 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 2415 ('isub', 31, ('ifind_msb_rev', 'value')), 2416 ('ifind_msb_rev', 'value')), 2417 'options->lower_ifind_msb && options->has_find_msb_rev'), 2418 2419 # uclz of an absolute value source almost always does the right thing. 2420 # There are a couple problem values: 2421 # 2422 # * 0x80000000. Since abs(0x80000000) == 0x80000000, uclz returns 0. 2423 # However, findMSB(int(0x80000000)) == 30. 2424 # 2425 # * 0xffffffff. Since abs(0xffffffff) == 1, uclz returns 31. Section 8.8 2426 # (Integer Functions) of the GLSL 4.50 spec says: 2427 # 2428 # For a value of zero or negative one, -1 will be returned. 2429 # 2430 # * Negative powers of two. uclz(abs(-(1<<x))) returns x, but 2431 # findMSB(-(1<<x)) should return x-1. 2432 # 2433 # For all negative number cases, including 0x80000000 and 0xffffffff, the 2434 # correct value is obtained from uclz if instead of negating the (already 2435 # negative) value the logical-not is used. A conditional logical-not can 2436 # be achieved by (x ^ (x >> 31)). 
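   # For example, for 32-bit x = -8 = 0xfffffff8: x >> 31 = 0xffffffff, so
   # x ^ (x >> 31) = 0x00000007, and 31 - uclz(7) = 31 - 29 = 2, which matches
   # findMSB(-8) == 2.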
2437 (('ifind_msb', 'value'), 2438 ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))), 2439 'options->lower_ifind_msb && options->has_uclz'), 2440 2441 (('ufind_msb', 'value@32'), 2442 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 2443 ('isub', 31, ('ufind_msb_rev', 'value')), 2444 ('ufind_msb_rev', 'value')), 2445 'options->lower_ufind_msb && options->has_find_msb_rev'), 2446 2447 (('ufind_msb', 'value@32'), 2448 ('isub', 31, ('uclz', 'value')), 2449 'options->lower_ufind_msb && options->has_uclz'), 2450 2451 (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'), 2452 2453 (('find_lsb', 'value@64'), 2454 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 2455 'options->lower_find_lsb'), 2456 2457 (('find_lsb', 'value'), 2458 ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))), 2459 'options->lower_find_lsb'), 2460 2461 (('extract_i8', a, 'b@32'), 2462 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 2463 'options->lower_extract_byte'), 2464 2465 (('extract_u8', a, 'b@32'), 2466 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 2467 'options->lower_extract_byte'), 2468 2469 (('extract_i16', a, 'b@32'), 2470 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 2471 'options->lower_extract_word'), 2472 2473 (('extract_u16', a, 'b@32'), 2474 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 2475 'options->lower_extract_word'), 2476 2477 (('pack_unorm_2x16', 'v'), 2478 ('pack_uvec2_to_uint', 2479 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 2480 'options->lower_pack_unorm_2x16'), 2481 2482 (('pack_unorm_4x8', 'v'), 2483 ('pack_uvec4_to_uint', 2484 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2485 'options->lower_pack_unorm_4x8 && !options->has_pack_32_4x8'), 2486 2487 (('pack_unorm_4x8', 'v'), 2488 ('pack_32_4x8', 2489 ('f2u8', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2490 'options->lower_pack_unorm_4x8 && options->has_pack_32_4x8'), 2491 2492 (('pack_snorm_2x16', 'v'), 2493 ('pack_uvec2_to_uint', 2494 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 2495 'options->lower_pack_snorm_2x16'), 2496 2497 (('pack_snorm_4x8', 'v'), 2498 ('pack_uvec4_to_uint', 2499 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2500 'options->lower_pack_snorm_4x8 && !options->has_pack_32_4x8'), 2501 2502 (('pack_snorm_4x8', 'v'), 2503 ('pack_32_4x8', 2504 ('f2i8', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2505 'options->lower_pack_snorm_4x8 && options->has_pack_32_4x8'), 2506 2507 (('unpack_unorm_2x16', 'v'), 2508 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 2509 ('extract_u16', 'v', 1))), 2510 65535.0), 2511 'options->lower_unpack_unorm_2x16'), 2512 2513 (('unpack_unorm_4x8', 'v'), 2514 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 2515 ('extract_u8', 'v', 1), 2516 ('extract_u8', 'v', 2), 2517 ('extract_u8', 'v', 3))), 2518 255.0), 2519 'options->lower_unpack_unorm_4x8'), 2520 2521 (('unpack_snorm_2x16', 'v'), 2522 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 2523 ('extract_i16', 'v', 1))), 2524 32767.0))), 2525 'options->lower_unpack_snorm_2x16'), 2526 2527 (('unpack_snorm_4x8', 'v'), 2528 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 2529 ('extract_i8', 'v', 1), 2530 ('extract_i8', 'v', 2), 2531 ('extract_i8', 'v', 3))), 2532 127.0))), 2533 'options->lower_unpack_snorm_4x8'), 2534 2535 (('pack_half_2x16_split', 
'a@32', 'b@32'), 2536 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 2537 'options->lower_pack_split'), 2538 2539 (('unpack_half_2x16_split_x', 'a@32'), 2540 ('f2f32', ('u2u16', a)), 2541 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2542 2543 (('unpack_half_2x16_split_x', 'a@32'), 2544 ('f2f32', ('fmul', 1.0, ('u2u16', a))), 2545 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2546 2547 (('unpack_half_2x16_split_y', 'a@32'), 2548 ('f2f32', ('u2u16', ('ushr', a, 16))), 2549 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2550 2551 (('unpack_half_2x16_split_y', 'a@32'), 2552 ('f2f32', ('fmul', 1.0, ('u2u16', ('ushr', a, 16)))), 2553 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2554 2555 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 2556 (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 2557 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 2558 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 2559 # Mark the new comparisons precise to prevent them being changed to 'a != 2560 # 0' or 'a == 0'. 2561 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 2562 (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'), 2563 2564 # Address/offset calculations: 2565 # Drivers supporting imul24 should use a pass like nir_lower_amul(), this 2566 # rule converts everyone else to imul: 2567 (('amul', a, b), ('imul', a, b), '!options->has_imul24 && !options->has_amul'), 2568 2569 # udiv_aligned_4 assumes the source is a multiple of 4 specifically to enable 2570 # this identity. Usually this transform would require masking. 
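   # For example, a == 12 divides to 3 and 3 * 4 == 12, so the multiply simply
   # undoes the divide; for an unaligned value such as 13 the round trip would
   # give 12, which is why the general transform would need an iand with ~3
   # instead of just 'a'.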
2571 (('amul', ('udiv_aligned_4', a), 4), a),
2572 (('imul', ('udiv_aligned_4', a), 4), a),
2573
2574 (('umul24', a, b),
2575 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
2576 '!options->has_umul24'),
2577 (('umad24', a, b, c),
2578 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
2579 '!options->has_umad24'),
2580
2581 # Relaxed 24bit ops
2582 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
2583 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
2584 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
2585 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
2586 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
2587 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
2588
2589 (('imad24_ir3', a, b, 0), ('imul24', a, b)),
2590 (('imad24_ir3', a, 0, c), (c)),
2591 (('imad24_ir3', a, 1, c), ('iadd', a, c)),
2592
2593 # if first two srcs are const, crack apart the imad so constant folding
2594 # can clean up the imul:
2595 # TODO ffma should probably get a similar rule:
2596 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
2597
2598 # These will turn 24b address/offset calc back into 32b shifts, but
2599 # it should be safe to get back some of the bits of precision that we
2600 # already decided were not necessary:
2601 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
2602 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
2603 (('imul24', a, 0), (0)),
2604])
2605
2606for bit_size in [8, 16, 32, 64]:
2607 cond = '!options->lower_uadd_sat'
2608 if bit_size == 64:
2609 cond += ' && !(options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64))'
2610 add = 'iadd@' + str(bit_size)
2611
2612 optimizations += [
2613 (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2614 (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond),
2615 (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond),
2616 (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2617 ]
2618
2619for bit_size in [8, 16, 32, 64]:
2620 cond = '!options->lower_usub_sat'
2621 if bit_size == 64:
2622 cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)'
2623 add = 'iadd@' + str(bit_size)
2624
2625 optimizations += [
2626 (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2627 (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2628 (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2629 (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2630 ]
2631
2632# bit_size dependent lowerings
2633for bit_size in [8, 16, 32, 64]:
2634 # convenience constants
2635 intmax = (1 << (bit_size - 1)) - 1
2636 intmin = 1 << (bit_size - 1)
2637
2638 optimizations += [
2639 (('iadd_sat@' + str(bit_size), a, b),
2640 ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2641 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2642 (('isub_sat@' + str(bit_size), a, b),
2643 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2644 ('bcsel',
('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2645 ]
2646
2647invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2648
2649for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2650 optimizations.append((('inot', ('ior(is_used_once)', (left + '(is_used_once)', a, b),
2651 (right + '(is_used_once)', c, d))),
2652 ('iand', (invert[left], a, b), (invert[right], c, d))))
2653 optimizations.append((('inot', ('iand(is_used_once)', (left + '(is_used_once)', a, b),
2654 (right + '(is_used_once)', c, d))),
2655 ('ior', (invert[left], a, b), (invert[right], c, d))))
2656
2657# Optimize x2yN(b2x(x)) -> b2y
2658for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2659 if x != 'f' and y != 'f' and x != y:
2660 continue
2661
2662 b2x = 'b2f' if x == 'f' else 'b2i'
2663 b2y = 'b2f' if y == 'f' else 'b2i'
2664 x2yN = '{}2{}'.format(x, y)
2665 optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2666
2667# Optimize away x2xN(a@N)
2668for t in ['int', 'uint', 'float', 'bool']:
2669 for N in type_sizes(t):
2670 x2xN = '{0}2{0}{1}'.format(t[0], N)
2671 aN = 'a@{0}'.format(N)
2672 optimizations.append(((x2xN, aN), a))
2673
2674# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2675# In particular, we can optimize away everything except upcast of downcast and
2676# upcasts where the type differs from the other cast
2677for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2678 if N < M:
2679 # The outer cast is a down-cast. It doesn't matter what the size of the
2680 # argument of the inner cast is because we'll never be in the upcast
2681 # of downcast case. Regardless of types, we'll always end up with y2yN
2682 # in the end.
2683 for x, y in itertools.product(['i', 'u'], ['i', 'u']):
2684 x2xN = '{0}2{0}{1}'.format(x, N)
2685 y2yM = '{0}2{0}{1}'.format(y, M)
2686 y2yN = '{0}2{0}{1}'.format(y, N)
2687 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
2688 elif N > M:
2689 # If the outer cast is an up-cast, we have to be more careful about the
2690 # size of the argument of the inner cast and with types. In this case,
2691 # the type is always the type of the up-cast, which is given by the
2692 # outer cast.
2693 for P in type_sizes('uint'):
2694 # We can't optimize away up-cast of down-cast.
2695 if M < P: 2696 continue 2697 2698 # Because we're doing down-cast of down-cast, the types always have 2699 # to match between the two casts 2700 for x in ['i', 'u']: 2701 x2xN = '{0}2{0}{1}'.format(x, N) 2702 x2xM = '{0}2{0}{1}'.format(x, M) 2703 aP = 'a@{0}'.format(P) 2704 optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a))) 2705 else: 2706 # The N == M case is handled by other optimizations 2707 pass 2708 2709# Downcast operations should be able to see through pack 2710for t in ['i', 'u']: 2711 for N in [8, 16, 32]: 2712 x2xN = '{0}2{0}{1}'.format(t, N) 2713 optimizations += [ 2714 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2715 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2716 ] 2717 2718# Optimize comparisons with up-casts 2719for t in ['int', 'uint', 'float']: 2720 for N, M in itertools.product(type_sizes(t), repeat=2): 2721 if N == 1 or N >= M: 2722 continue 2723 2724 cond = 'true' 2725 if N == 8: 2726 cond = 'options->support_8bit_alu' 2727 elif N == 16: 2728 cond = 'options->support_16bit_alu' 2729 x2xM = '{0}2{0}{1}'.format(t[0], M) 2730 x2xN = '{0}2{0}{1}'.format(t[0], N) 2731 aN = 'a@' + str(N) 2732 bN = 'b@' + str(N) 2733 xeq = 'feq' if t == 'float' else 'ieq' 2734 xne = 'fneu' if t == 'float' else 'ine' 2735 xge = '{0}ge'.format(t[0]) 2736 xlt = '{0}lt'.format(t[0]) 2737 2738 # Up-casts are lossless so for correctly signed comparisons of 2739 # up-casted values we can do the comparison at the largest of the two 2740 # original sizes and drop one or both of the casts. (We have 2741 # optimizations to drop the no-op casts which this may generate.) 2742 for P in type_sizes(t): 2743 if P == 1 or P > N: 2744 continue 2745 2746 bP = 'b@' + str(P) 2747 optimizations += [ 2748 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2749 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2750 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2751 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2752 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2753 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2754 ] 2755 2756 # The next bit doesn't work on floats because the range checks would 2757 # get way too complicated. 2758 if t in ['int', 'uint']: 2759 if t == 'int': 2760 xN_min = -(1 << (N - 1)) 2761 xN_max = (1 << (N - 1)) - 1 2762 elif t == 'uint': 2763 xN_min = 0 2764 xN_max = (1 << N) - 1 2765 else: 2766 assert False 2767 2768 # If we're up-casting and comparing to a constant, we can unfold 2769 # the comparison into a comparison with the shrunk down constant 2770 # and a check that the constant fits in the smaller bit size. 
2771 optimizations += [
2772 ((xeq, (x2xM, aN), '#b'),
2773 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2774 ((xne, (x2xM, aN), '#b'),
2775 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2776 ((xlt, (x2xM, aN), '#b'),
2777 ('iand', (xlt, xN_min, b),
2778 ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2779 ((xlt, '#a', (x2xM, bN)),
2780 ('iand', (xlt, a, xN_max),
2781 ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2782 ((xge, (x2xM, aN), '#b'),
2783 ('iand', (xge, xN_max, b),
2784 ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2785 ((xge, '#a', (x2xM, bN)),
2786 ('iand', (xge, a, xN_min),
2787 ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2788 ]
2789
2790# Convert masking followed by signed downcast to just unsigned downcast
2791optimizations += [
2792 (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2793 (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2794 (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2795 (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2796 (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2797 (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2798]
2799
2800# Some operations such as iadd have the property that the bottom N bits of the
2801# output only depend on the bottom N bits of each of the inputs so we can
2802# remove casts
2803for N in [16, 32]:
2804 for M in [8, 16]:
2805 if M >= N:
2806 continue
2807
2808 aN = 'a@' + str(N)
2809 u2uM = 'u2u{0}'.format(M)
2810 i2iM = 'i2i{0}'.format(M)
2811
2812 for x in ['u', 'i']:
2813 x2xN = '{0}2{0}{1}'.format(x, N)
2814 extract_xM = 'extract_{0}{1}'.format(x, M)
2815
2816 x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2817 extract_xM_M_bits = \
2818 '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2819 optimizations += [
2820 ((x2xN_M_bits, (u2uM, aN)), a),
2821 ((extract_xM_M_bits, aN, 0), a),
2822 ]
2823
2824 bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2825 optimizations += [
2826 ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2827 ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2828 ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2829 ]
2830
2831 for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2832 op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2833 optimizations += [
2834 ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2835 ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2836 ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2837 ]
2838
2839def fexp2i(exp, bits):
2840 # Generate an expression which constructs value 2.0^exp or 0.0.
2841 #
2842 # We assume that exp is already in a valid range:
2843 #
2844 # * [-15, 15] for 16-bit float
2845 # * [-127, 127] for 32-bit float
2846 # * [-1023, 1023] for 64-bit float
2847 #
2848 # If exp is the lowest value in the valid range, a value of 0.0 is
2849 # constructed. Otherwise, the value 2.0^exp is constructed.
2850 if bits == 16:
2851 return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2852 elif bits == 32:
2853 return ('ishl', ('iadd', exp, 127), 23)
2854 elif bits == 64:
2855 return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2856 else:
2857 assert False
2858
2859def ldexp(f, exp, bits):
2860 # The maximum possible range for a normal exponent is [-126, 127] and,
2861 # throwing in denormals, you get a maximum range of [-149, 127]. This
2862 # means that we can potentially have a swing of +-276.
If you start with 2863 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2864 # all the way to zero. The GLSL spec only requires that we handle a subset 2865 # of this range. From version 4.60 of the spec: 2866 # 2867 # "If exp is greater than +128 (single-precision) or +1024 2868 # (double-precision), the value returned is undefined. If exp is less 2869 # than -126 (single-precision) or -1022 (double-precision), the value 2870 # returned may be flushed to zero. Additionally, splitting the value 2871 # into a significand and exponent using frexp() and then reconstructing 2872 # a floating-point value using ldexp() should yield the original input 2873 # for zero and all finite non-denormalized values." 2874 # 2875 # The SPIR-V spec has similar language. 2876 # 2877 # In order to handle the maximum value +128 using the fexp2i() helper 2878 # above, we have to split the exponent in half and do two multiply 2879 # operations. 2880 # 2881 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2882 # twice the full range that is valid for the fexp2i() function above. If 2883 # exp/2 is the bottom value of that range, the fexp2i() expression will 2884 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2885 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2886 # value is clamped from above, then it must have been above the supported 2887 # range of the GLSL built-in and therefore any return value is acceptable. 2888 if bits == 16: 2889 exp = ('imin', ('imax', exp, -30), 30) 2890 elif bits == 32: 2891 exp = ('imin', ('imax', exp, -254), 254) 2892 elif bits == 64: 2893 exp = ('imin', ('imax', exp, -2046), 2046) 2894 else: 2895 assert False 2896 2897 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2898 # (We use ishr which isn't the same for -1, but the -1 case still works 2899 # since we use exp-exp/2 as the second exponent.) While the spec 2900 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2901 # work with denormals and doesn't allow for the full swing in exponents 2902 # that you can get with normalized values. Instead, we create two powers 2903 # of two and multiply by them each in turn. That way the effective range 2904 # of our exponent is doubled. 
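   # For example, a clamped exp of 254 (32-bit) splits into ishr(254, 1) == 127
   # and 254 - 127 == 127, so f is multiplied by 2^127 twice even though 2^254
   # itself is not representable.  The ishr rounding is also harmless for odd
   # negative values: exp == -1 splits into -1 and 0, i.e. 2^-1 * 2^0.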
2905 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2906 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2907 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2908 2909optimizations += [ 2910 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2911 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2912 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2913] 2914 2915# XCOM 2 (OpenGL) open-codes bitfieldReverse() 2916def bitfield_reverse_xcom2(u): 2917 step1 = ('iadd', ('ishl', u, 16), ('ushr', u, 16)) 2918 step2 = ('iadd', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2919 step3 = ('iadd', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2920 step4 = ('iadd', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2921 step5 = ('iadd(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2922 2923 return step5 2924 2925# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2926def bitfield_reverse_ue4(u): 2927 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2928 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2929 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2930 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2931 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2932 2933 return step5 2934 2935# Cyberpunk 2077 open-codes bitfieldReverse() 2936def bitfield_reverse_cp2077(u): 2937 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2938 step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2939 step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2940 step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2941 step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2942 2943 return step5 2944 2945optimizations += [(bitfield_reverse_xcom2('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2946optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2947optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2948 2949# VKD3D-Proton DXBC f32 to f16 conversion implements a float conversion using PackHalf2x16. 2950# Because the spec does not specify a rounding mode or behaviour regarding infinity, 2951# it emits a sequence to ensure D3D-like behaviour for infinity. 2952# When we know the current backend already behaves like we need, we can eliminate the extra sequence. 2953# 2954# Input is f32, output is u32 that has the f16 packed into its low bits. 
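# The fixup below relies on the f16 bit patterns: subtracting 1 from a packed
# infinity (0x7c00 or 0xfc00 in the low half) yields the largest-magnitude
# finite f16 (0x7bff or 0xfbff, i.e. +/-65504), so a finite f32 that overflows
# f16 ends up clamped instead of infinite, while a true infinity input is
# excluded by the abs(a) != 0x7f800000 check and kept as infinity.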
2955def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a): 2956 packed_half = ('pack_half_2x16_rtz_split', a, 0) 2957 packed_half_minus1 = ('iadd', packed_half, 0xffffffff) 2958 f32_was_not_inf = ('ine', abs_a, 0x7f800000) 2959 f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00) 2960 return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half) 2961 2962optimizations += [ 2963 (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)), 2964 (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)), 2965 (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)), 2966] 2967 2968def vkd3d_proton_msad(): 2969 pattern = None 2970 for i in range(4): 2971 ref = ('extract_u8', 'a@32', i) 2972 src = ('extract_u8', 'b@32', i) 2973 sad = ('iabs', ('iadd', ref, ('ineg', src))) 2974 msad = ('bcsel', ('ieq', ref, 0), 0, sad) 2975 if pattern == None: 2976 pattern = msad 2977 else: 2978 pattern = ('iadd', pattern, msad) 2979 pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:]) 2980 return pattern 2981 2982optimizations += [ 2983 (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'), 2984 (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)), 2985] 2986 2987 2988# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2989# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2990for ncomp in [2, 3, 4, 8, 16]: 2991 optimizations += [ 2992 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2993 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2994 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2995 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2996 ] 2997 2998# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2999# then the "a == a" is redundant because it's equivalent to "a is not NaN" 3000# and, if a is a NaN then the second comparison will fail anyway. 3001for op in ['flt', 'fge', 'feq']: 3002 optimizations += [ 3003 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 3004 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 3005 ] 3006 3007# Add optimizations to handle the case where the result of a ternary is 3008# compared to a constant. This way we can take things like 3009# 3010# (a ? 0 : 1) > 0 3011# 3012# and turn it into 3013# 3014# a ? (0 > 0) : (1 > 0) 3015# 3016# which constant folding will eat for lunch. The resulting ternary will 3017# further get cleaned up by the boolean reductions above and we will be 3018# left with just the original variable "a". 3019for op in ['feq', 'fneu', 'ieq', 'ine']: 3020 optimizations += [ 3021 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 3022 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 3023 ] 3024 3025for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 3026 optimizations += [ 3027 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 3028 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 3029 ((op, '#d', ('bcsel', a, '#b', '#c')), 3030 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 3031 ] 3032 3033 3034# For example, this converts things like 3035# 3036# 1 + mix(0, a - 1, condition) 3037# 3038# into 3039# 3040# mix(1, (a-1)+1, condition) 3041# 3042# Other optimizations will rearrange the constants. 
3043for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']: 3044 optimizations += [ 3045 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 3046 ] 3047 3048# Some optimizations for ir3-specific instructions. 3049optimizations += [ 3050 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 3051 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 3052 # '(al * bh) << 16 + c': If either 'al' or 'bh' is zero, return 'c'. 3053 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 3054 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 3055] 3056 3057# These kinds of sequences can occur after nir_opt_peephole_select. 3058# 3059# NOTE: fadd is not handled here because that gets in the way of ffma 3060# generation in the i965 driver. Instead, fadd and ffma are handled in 3061# late_optimizations. 3062 3063for op in ['flrp']: 3064 optimizations += [ 3065 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3066 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3067 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3068 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3069 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 3070 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 3071 ] 3072 3073for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 3074 optimizations += [ 3075 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 3076 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3077 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3078 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 3079 ] 3080 3081for op in ['fpow']: 3082 optimizations += [ 3083 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3084 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3085 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 3086 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 3087 ] 3088 3089for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']: 3090 optimizations += [ 3091 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 3092 ] 3093 3094for op in ['ineg', 'iabs', 'inot', 'isign']: 3095 optimizations += [ 3096 ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))), 3097 ] 3098 3099optimizations.extend([ 3100 (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'), 3101 (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'), 3102 (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal') 3103 ]) 3104 3105 3106""" 3107 if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16) 3108 return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate 
*/; 3109 else 3110 return f2f32(f2f16(val)); 3111""" 3112optimizations.extend([ 3113 (('fquantize2f16', 'a@32'), 3114 ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)), 3115 ('iand', a, 1 << 31), 3116 ('!f2f32', ('!f2f16_rtne', a))), 3117 'options->lower_fquantize2f16') 3118 ]) 3119 3120for s in range(0, 31): 3121 mask = 0xffffffff << s 3122 3123 # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior 3124 # will never both have the same bits set, replacing the ior with an iadd 3125 # is safe (i.e., a carry out of a bit can never be generated). The iadd is 3126 # more likely to participate in other optimization patterns (e.g., iadd of 3127 # constant reassociation) 3128 optimizations.extend([ 3129 (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)), 3130 'options->avoid_ternary_with_two_constants'), 3131 ]) 3132 3133# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN. 3134# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here) 3135for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']: 3136 optimizations += [((op, '#a(is_nan)', b), NAN)] 3137 optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative 3138 3139# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN. 3140for op in ['ffma', 'flrp']: 3141 optimizations += [((op, '#a(is_nan)', b, c), NAN)] 3142 optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative 3143 optimizations += [((op, a, b, '#c(is_nan)'), NAN)] 3144 3145# NaN propagation: FP min/max. Pick the non-NaN operand. 3146for op in ['fmin', 'fmax']: 3147 optimizations += [((op, '#a(is_nan)', b), b)] # commutative 3148 3149# NaN propagation: ldexp is NaN if the first operand is NaN. 3150optimizations += [(('ldexp', '#a(is_nan)', b), NAN)] 3151 3152# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN. 3153for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']: 3154 optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative 3155 3156# NaN propagation: FP comparison opcodes except !=. Replace it with false. 3157for op in ['feq', 'fge', 'flt']: 3158 optimizations += [((op, '#a(is_nan)', b), False)] 3159 optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative 3160 3161# NaN propagation: FP comparison opcodes using !=. Replace it with true. 3162# Operator != is the only opcode where a comparison with NaN returns true. 3163for op in ['fneu']: 3164 optimizations += [((op, '#a(is_nan)', b), True)] # commutative 3165 3166# NaN propagation: FP comparison opcodes except != returning FP 0 or 1. 3167for op in ['seq', 'sge', 'slt']: 3168 optimizations += [((op, '#a(is_nan)', b), 0.0)] 3169 optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative 3170 3171# NaN propagation: FP comparison opcodes using != returning FP 0 or 1. 3172# Operator != is the only opcode where a comparison with NaN returns true. 3173optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative 3174 3175# This section contains optimizations to propagate downsizing conversions of 3176# constructed vectors into vectors of downsized components. Whether this is 3177# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 3178# this reduces the register pressure of the vector itself and often enables the 3179# conversions to be eliminated via other algebraic rules or constant folding. 
3180# In the worst case on a SIMD architecture, the propagated conversions may be 3181# revectorized via nir_opt_vectorize so instruction count is minimally 3182# impacted. 3183# 3184# On a machine with SIMD-within-a-register only, this actually 3185# counterintuitively hurts instruction count. These machines are the same that 3186# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 3187# not being set. 3188# 3189# Finally for scalar architectures, there should be no difference in generated 3190# code since it all ends up scalarized at the end, but it might minimally help 3191# compile-times. 3192 3193for i in range(2, 4 + 1): 3194 for T in ('f', 'u', 'i'): 3195 vec_inst = ('vec' + str(i),) 3196 3197 indices = ['a', 'b', 'c', 'd'] 3198 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 3199 3200 to_16 = '{}2{}16'.format(T, T) 3201 to_mp = '{}2{}mp'.format(T, T) 3202 3203 out_16 = tuple((to_16, indices[j]) for j in range(i)) 3204 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 3205 3206 optimizations += [ 3207 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 3208 ] 3209 # u2ump doesn't exist, because it's equal to i2imp 3210 if T in ['f', 'i']: 3211 optimizations += [ 3212 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 3213 ] 3214 3215# This section contains "late" optimizations that should be run before 3216# creating ffmas and calling regular optimizations for the final time. 3217# Optimizations should go here if they help code generation and conflict 3218# with the regular optimizations. 3219before_ffma_optimizations = [ 3220 # Propagate constants down multiplication chains 3221 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 3222 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 3223 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 3224 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 3225 3226 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 3227 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 3228 (('~fadd', ('fneg', a), a), 0.0), 3229 (('iadd', ('ineg', a), a), 0), 3230 (('iadd', ('ineg', a), ('iadd', a, b)), b), 3231 (('iadd', a, ('iadd', ('ineg', a), b)), b), 3232 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 3233 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 3234 3235 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 3236 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 3237 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 3238] 3239 3240# This section contains "late" optimizations that should be run after the 3241# regular optimizations have finished. Optimizations should go here if 3242# they help code generation but do not necessarily produce code that is 3243# more easily optimizable. 3244late_optimizations = [ 3245 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 3246 # results if one operand is +Inf and the other is -Inf. 3247 # 3248 # 1. Inf + -Inf = NaN 3249 # 2. 
∀x: x + NaN = NaN and x - NaN = NaN 3250 # 3. ∀x: x != NaN = true 3251 # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 3252 # 3253 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 3254 # (a+b) < 0 false false false false 3255 # a < -b false false false false 3256 # -(a+b) < 0 false false false false 3257 # -a < b false false false false 3258 # (a+b) >= 0 false false false false 3259 # a >= -b true true false false 3260 # -(a+b) >= 0 false false false false 3261 # -a >= b true true false false 3262 # (a+b) == 0 false false false false 3263 # a == -b true true false false 3264 # (a+b) != 0 true true true true 3265 # a != -b false false true true 3266 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 3267 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 3268 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 3269 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 3270 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 3271 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 3272 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 3273 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 3274 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 3275 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 3276 3277 # If either source must be finite, then the original (a+b) cannot produce 3278 # NaN due to Inf-Inf. The patterns and the replacements produce the same 3279 # result if b is NaN. Therefore, the replacements are exact. 3280 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 3281 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 3282 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 3283 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 3284 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 3285 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 3286 3287 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 3288 # SpvOpLessOrGreater. 3289 *add_fabs_fneg((('iand', ('fneu', 'ma', 'mb'), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma'))), {'ma' : a, 'mb' : b}), 3290 (('iand', ('fneu', a, 0.0), ('feq', a, a)), ('!flt', 0.0, ('fabs', a))), 3291 3292 # This is how SpvOpFUnordEqual might be implemented. Replace it with 3293 # !SpvOpLessOrGreater. 
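   # For example, with a = 2.0 and b = NaN the original sequence evaluates as
   # (2.0 == NaN) || (2.0 != 2.0) || (NaN != NaN) = true, and the replacement
   # evaluates as !((2.0 < NaN) || (NaN < 2.0)) = !false = true, so the
   # unordered-equal result is preserved for NaN operands as well as for
   # ordinary numbers.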
3294 *add_fabs_fneg((('ior', ('feq', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma')))), {'ma' : a, 'mb' : b}), 3295 (('ior', ('feq', a, 0.0), ('fneu', a, a)), ('inot', ('!flt', 0.0, ('fabs', a)))), 3296 3297 *add_fabs_fneg((('ior', ('flt', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('fge', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3298 *add_fabs_fneg((('ior', ('fge', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('flt', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3299 *add_fabs_fneg((('ior', ('flt', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('fge', 'ma', b))), {'ma' : a}), 3300 *add_fabs_fneg((('ior', ('fge', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('flt', 'ma', b))), {'ma' : a}), 3301 *add_fabs_fneg((('ior', ('flt', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('fge', a, 'mb'))), {'mb' : b}), 3302 *add_fabs_fneg((('ior', ('fge', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('flt', a, 'mb'))), {'mb' : b}), 3303 *add_fabs_fneg((('iand', ('fneu', 'ma', 'b(is_a_number)'), ('feq', a, a)), ('fneo', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3304 *add_fabs_fneg((('ior', ('feq', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('fequ', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3305 3306 (('ior', ('flt', a, b), ('flt', b, a)), ('fneo', a, b), 'options->has_fneo_fcmpu'), 3307 (('flt', 0.0, ('fabs', a)), ('fneo', 0.0, a), 'options->has_fneo_fcmpu'), 3308 3309 3310 # These don't interfere with the previous optimizations which include this 3311 # in the search expression, because nir_algebraic_impl visits instructions 3312 # in reverse order. 3313 (('ior', ('fneu', 'a@16', a), ('fneu', 'b@16', b)), ('funord', a, b), 'options->has_ford_funord'), 3314 (('iand', ('feq', 'a@16', a), ('feq', 'b@16', b)), ('ford', a, b), 'options->has_ford_funord'), 3315 (('ior', ('fneu', 'a@32', a), ('fneu', 'b@32', b)), ('funord', a, b), 'options->has_ford_funord'), 3316 (('iand', ('feq', 'a@32', a), ('feq', 'b@32', b)), ('ford', a, b), 'options->has_ford_funord'), 3317 (('ior', ('fneu', 'a@64', a), ('fneu', 'b@64', b)), ('funord', a, b), 'options->has_ford_funord'), 3318 (('iand', ('feq', 'a@64', a), ('feq', 'b@64', b)), ('ford', a, b), 'options->has_ford_funord'), 3319 3320 (('inot', ('ford(is_used_once)', a, b)), ('funord', a, b)), 3321 (('inot', ('funord(is_used_once)', a, b)), ('ford', a, b)), 3322 (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 3323 (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 3324 (('inot', ('fequ(is_used_once)', a, b)), ('fneo', a, b)), 3325 (('inot', ('fneo(is_used_once)', a, b)), ('fequ', a, b)), 3326 (('inot', ('flt(is_used_once)', a, b)), ('fgeu', a, b), 'options->has_fneo_fcmpu'), 3327 (('inot', ('fgeu(is_used_once)', a, b)), ('flt', a, b)), 3328 (('inot', ('fge(is_used_once)', a, b)), ('fltu', a, b), 'options->has_fneo_fcmpu'), 3329 (('inot', ('fltu(is_used_once)', a, b)), ('fge', a, b)), 3330 3331 # nir_lower_to_source_mods will collapse this, but its existence during the 3332 # optimization loop can prevent other optimizations. 3333 (('fneg', ('fneg', a)), a), 3334 3335 # combine imul and iadd to imad 3336 (('iadd@32', ('imul(is_only_used_by_iadd)', a, b), c), ('imad', a, b, c), 'options->has_imad32'), 3337 3338 # Drivers do not actually implement udiv_aligned_4, it is just used to 3339 # optimize scratch lowering. 3340 (('udiv_aligned_4', a), ('ushr', a, 2)), 3341] 3342 3343# re-combine inexact mul+add to ffma. 
Do this before fsub so that a * b - c 3344# gets combined to fma(a, b, -c). 3345for sz, mulz in itertools.product([16, 32, 64], [False, True]): 3346 # fmulz/ffmaz only for fp32 3347 if mulz and sz != 32: 3348 continue 3349 3350 # Fuse the correct fmul. Only consider fmuls where the only users are fadd 3351 # (or fneg/fabs which are assumed to be propagated away), as a heuristic to 3352 # avoid fusing in cases where it's harmful. 3353 fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)' 3354 ffma = 'ffmaz' if mulz else 'ffma' 3355 3356 fadd = '~fadd@{}'.format(sz) 3357 option = 'options->fuse_ffma{}'.format(sz) 3358 3359 late_optimizations.extend([ 3360 ((fadd, (fmul, a, b), c), (ffma, a, b, c), option), 3361 3362 ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c), 3363 (ffma, ('fneg', a), b, c), option), 3364 3365 ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c), 3366 (ffma, ('fabs', a), ('fabs', b), c), option), 3367 3368 ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c), 3369 (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option), 3370 ]) 3371 3372late_optimizations.extend([ 3373 # Subtractions get lowered during optimization, so we need to recombine them 3374 (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3375 (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3376 (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3377 (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'), 3378 3379 (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 3380 (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 3381 (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 3382 (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 3383]) 3384 3385for s in [8, 16, 32, 64]: 3386 cond = 'options->has_iadd3' 3387 if s == 64: 3388 cond += ' && !(options->lower_int64_options & nir_lower_iadd3_64)' 3389 3390 iadd = "iadd@{}".format(s) 3391 3392 # On Intel GPUs, the constant field for an ADD3 instruction must be either 3393 # int16_t or uint16_t. 3394 late_optimizations.extend([ 3395 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3396 ((iadd, ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3397 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'), ('iadd3', a, b, c), cond), 3398 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3399 ((iadd, ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3400 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3401 3402 ((iadd, ('ishl', a, 1), 'b(is_not_const)'), ('iadd3', a, a, b), cond), 3403 ((iadd, ('ishl', a, 1), '#b(is_16_bits)' ), ('iadd3', a, a, b), cond), 3404 ((iadd, ('ineg', ('ishl', a, 1)), 'b(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3405 ((iadd, ('ineg', ('ishl', a, 1)), '#b(is_16_bits)' ), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3406 3407 # Use special checks to ensure (b+b) or -(b+b) fit in 16 bits. 
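   # For example, with a 32-bit constant b = 0x4000, (a + 0x4000) << 1 can
   # become iadd3(a, a, 0x8000) because the doubled constant still fits in 16
   # bits (as a uint16_t), whereas b = 0x8000 would double to 0x10000 and the
   # pattern must not apply.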
3408 (('ishl@{}'.format(s), ('iadd', a, '#b(is_2x_16_bits)'), 1), ('iadd3', a, a, ('iadd', b, b)), cond), 3409 (('ishl@{}'.format(s), ('ineg', ('iadd', a, '#b(is_neg2x_16_bits)')), 1), ('iadd3', ('ineg', a), ('ineg', a), ('ineg', ('iadd', b, b))), cond), 3410 ]) 3411 3412late_optimizations.extend([ 3413 # fneg_lo / fneg_hi 3414 (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'), 3415 (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'), 3416 3417 # These are duplicated from the main optimizations table. The late 3418 # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 3419 # new patterns like these. The patterns that compare with zero are removed 3420 # because they are unlikely to be created by anything in 3421 # late_optimizations. 3422 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 3423 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 3424 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 3425 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 3426 3427 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 3428 3429 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 3430 3431 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 3432 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 3433 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 3434 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 3435 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 3436 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 3437 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 3438 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 3439 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 3440 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 3441 3442 (('ior', a, a), a), 3443 (('iand', a, a), a), 3444 3445 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 3446 3447 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 3448 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 3449 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 3450 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 3451 3452 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 3453 3454 # Approximate handling of fround_even for DX9 addressing from gallium nine on 3455 # DX9-class hardware with no proper fround support. This is in 3456 # late_optimizations so that the is_integral() opts in the main pass get a 3457 # chance to eliminate the fround_even first. 3458 (('fround_even', a), ('bcsel', 3459 ('feq', ('ffract', a), 0.5), 3460 ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 3461 ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 3462 3463 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 3464 # particular operation is common for expanding values stored in a texture 3465 # from [0,1] to [-1,1].
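   # As a reminder, flrp(x, y, s) = x * (1 - s) + y * s, so
   # flrp(-1.0, 1.0, a) = -1.0 * (1 - a) + a = 2.0 * a - 1.0, which is exactly
   # the ffma(a, 2.0, -1.0) form matched below.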
3466 (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3467 (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3468 (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3469 (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3470 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3471 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3472 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3473 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3474 3475 # flrp(a, b, a) 3476 # a*(1-a) + b*a 3477 # a + -a*a + a*b (1) 3478 # a + a*(b - a) 3479 # Option 1: ffma(a, (b-a), a) 3480 # 3481 # Alternately, after (1): 3482 # a*(1+b) + -a*a 3483 # a*((1+b) + -a) 3484 # 3485 # Let b=1 3486 # 3487 # Option 2: ffma(a, 2, -(a*a)) 3488 # Option 3: ffma(a, 2, (-a)*a) 3489 # Option 4: ffma(a, -a, (2*a) 3490 # Option 5: a * (2 - a) 3491 # 3492 # There are a lot of other possible combinations. 3493 (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 3494 (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3495 (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3496 (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3497 (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3498 3499 # we do these late so that we don't get in the way of creating ffmas 3500 (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 3501 (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 3502 3503 # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 3504 # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 3505 (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 3506 ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 3507 3508 # Things that look like DPH in the source shader may get expanded to 3509 # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 3510 # to NIR. After FFMA is generated, this can look like: 3511 # 3512 # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 3513 # 3514 # Reassociate the last addition into the first multiplication. 3515 # 3516 # Some shaders do not use 'invariant' in vertex and (possibly) geometry 3517 # shader stages on some outputs that are intended to be invariant. For 3518 # various reasons, this optimization may not be fully applied in all 3519 # shaders used for different rendering passes of the same geometry. This 3520 # can result in Z-fighting artifacts (at best). For now, disable this 3521 # optimization in these stages. See bugzilla #111490. In tessellation 3522 # stages applications seem to use 'precise' when necessary, so allow the 3523 # optimization in those stages. 
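   # After the reassociation, the expression above becomes
   #
   #    ffma(v1.z, v2.z, ffma(v1.y, v2.y, ffma(v1.x, v2.x, v1.w)))
   #
   # i.e. v1.w is folded into the innermost multiply-add instead of being
   # added last, which is why the rounding of the result can change.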
3524 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma(is_used_once)', c, d, ('ffma', e, 'f', ('fmul(is_used_once)', 'g(is_not_const_and_not_fsign)', 'h(is_not_const_and_not_fsign)')))), 'i(is_not_const)'), 3525 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', ('ffma', 'g', 'h', 'i')))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3526 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3527 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3528 (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3529 ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3530 (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3531 ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3532 3533 (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3534 ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3535 (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3536 ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3537 (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3538 ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3539 3540 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 3541 # 3542 # If bits is zero, the result will be zero. 3543 # 3544 # These prevent the next two lowerings generating incorrect results when 3545 # count is zero. 3546 (('ubfe', a, b, 0), 0), 3547 (('ibfe', a, b, 0), 0), 3548 3549 # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 3550 # instructions on Intel GPUs, it cannot have an immediate values as 3551 # sources. There are also limitations on source register strides. As a 3552 # result, it is very easy for 3-source instruction combined with either 3553 # loads of immediate values or copies from weird register strides to be 3554 # more expensive than the primitive instructions it represents. 3555 (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 3556 3557 # b is the lowest order bit to be extracted and c is the number of bits to 3558 # extract. The inner shift removes the bits above b + c by shifting left 3559 # 32 - (b + c). 
ishl only sees the low 5 bits of the shift count, which is 3560 # -(b + c). The outer shift moves the bit that was at b to bit zero. 3561 # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c. 3562 # This means that it must be shifted right by 32 - c or -c bits. 3563 (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 3564 3565 # Clean up no-op shifts that may result from the bfe lowerings. 3566 (('ishl', a, 0), a), 3567 (('ishl', a, -32), a), 3568 (('ishr', a, 0), a), 3569 (('ishr', a, -32), a), 3570 (('ushr', a, 0), a), 3571 3572 (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 3573 (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 3574 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 3575 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 3576 3577 # open coded bit test 3578 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3579 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3580 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3581 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3582 (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'), 3583 (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'), 3584 (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'), 3585 (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'), 3586 (('bitz', ('ushr', a, b), 0), ('bitz', a, b)), 3587 (('bitz', ('ishr', a, b), 0), ('bitz', a, b)), 3588 (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)), 3589 (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)), 3590 (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3591 (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3592 (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'), 3593 (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'), 3594 (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3595 (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3596 (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'), 3597 (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'), 3598 (('inot', ('bitnz', a, b)), ('bitz', a, b)), 3599 (('inot', ('bitz', a, b)), ('bitnz', a, b)), 3600 (('bitnz', ('inot', a), b), ('bitz', a, b)), 3601 (('bitz', ('inot', a), b), ('bitnz', a, b)), 3602]) 3603 3604# A few more extract cases we'd rather leave late 3605for N in [16, 32]: 3606 aN = 'a@{0}'.format(N) 3607 u2uN = 'u2u{0}'.format(N) 3608 i2iN = 'i2i{0}'.format(N) 3609 3610 for x in ['u', 'i']: 3611 x2xN = '{0}2{0}{1}'.format(x, N) 3612 extract_x8 = 'extract_{0}8'.format(x) 3613 extract_x16 = 'extract_{0}16'.format(x) 3614 3615 late_optimizations.extend([ 3616 ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3617 ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3618 ]) 3619 3620 if N > 16: 3621 late_optimizations.extend([ 3622 ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 3623 ((x2xN, ('i2i16', aN)),
(extract_x16, a, 0), '!options->lower_extract_word'), 3624 ]) 3625 3626# Byte insertion 3627late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 3628late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 3629late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) 3630 3631late_optimizations += [ 3632 # Word insertion 3633 (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), 3634 3635 # Extract and then insert 3636 (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), 3637 (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), 3638] 3639 3640# Float sizes 3641for s in [16, 32, 64]: 3642 late_optimizations.extend([ 3643 (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)), 3644 (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))), 3645 ]) 3646 3647for op in ['fadd']: 3648 late_optimizations += [ 3649 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3650 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3651 ] 3652 3653for op in ['ffma', 'ffmaz']: 3654 late_optimizations += [ 3655 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3656 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3657 3658 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3659 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3660 ] 3661 3662# mediump: If an opcode is surrounded by conversions, remove the conversions. 3663# The rationale is that type conversions + the low precision opcode are more 3664# expensive than the same arithmetic opcode at higher precision. 3665# 3666# This must be done in late optimizations, because we need normal optimizations to 3667# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))). 3668# 3669# Unary opcodes 3670for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg', 3671 'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']: 3672 late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))] 3673 3674# Binary opcodes 3675for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']: 3676 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))] 3677 3678# Ternary opcodes 3679for op in ['ffma', 'flrp']: 3680 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))] 3681 3682# Comparison opcodes 3683for op in ['feq', 'fge', 'flt', 'fneu']: 3684 late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))] 3685 3686# Do this last, so that the f2fmp patterns above have effect. 3687late_optimizations += [ 3688 # Convert *2*mp instructions to concrete *2*16 instructions. At this point 3689 # any conversions that could have been removed will have been removed in 3690 # nir_opt_algebraic so any remaining ones are required.
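   # For example, a leftover f2fmp(x) simply becomes f2f16(x) here.  Since
   # u2ump does not exist (it is equal to i2imp, as noted earlier), i2imp
   # appears twice below: i2i16 and u2u16 both keep only the low 16 bits, so
   # either is a valid concrete form.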
3691 (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"), 3692 (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"), 3693 (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"), 3694 (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"), 3695 (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"), 3696 (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"), 3697 (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"), 3698 (('fisfinite', a), ('flt', ('fabs', a), float("inf"))), 3699 3700 (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"), 3701 3702 (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 3703 (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3704 (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 3705 (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3706 3707 (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"), 3708 (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"), 3709 (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"), 3710 (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"), 3711 3712 (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 3713 (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3714 (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 3715 (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3716] 3717 3718for s in [16, 32, 64]: 3719 late_optimizations.extend([ 3720 (('bcsel@{}'.format(s), ('ieq', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, b, c), "options->has_icsel_eqz{} && !options->no_integers".format(s)), 3721 (('bcsel@{}'.format(s), ('ine', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, c, b), "options->has_icsel_eqz{} && !options->no_integers".format(s)), 3722 ]) 3723 3724distribute_src_mods = [ 3725 # Try to remove some spurious negations rather than pushing them down. 3726 (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), 3727 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 3728 (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)), 3729 (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)), 3730 (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)), 3731 (('fneg', ('fneg', a)), a), 3732 3733 (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)), 3734 (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))), 3735 3736 (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))), 3737 (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)), 3738 (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))), 3739 3740 # Note that fmin <-> fmax. 
I don't think there is a way to distribute 3741 # fabs() into fmin or fmax. 3742 (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))), 3743 (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))), 3744 3745 (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)), 3746 (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)), 3747 (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)), 3748 3749 # fdph works mostly like fdot, but to get the correct result, the negation 3750 # must be applied to the second source. 3751 (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))), 3752 3753 (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))), 3754 (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))), 3755] 3756 3757before_lower_int64_optimizations = [ 3758 # The i2i64(a) implies that 'a' has at most 32-bits of data. 3759 (('ishl', ('i2i64', a), b), 3760 # Effective shift count of zero, just return 'a'. 3761 ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a), 3762 ('bcsel', ('ilt', ('iand', b, 63), 32), 3763 # Shifting less than 32 bits, so both 32-bit halves will have 3764 # some data. These (and the else case) shift counts are of 32-bit 3765 # values, so the shift counts are implicitly modulo 32. 3766 ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32) )), 3767 # Shifting 32 bits or more, so lower 32 bits must be zero. 3768 ('pack_64_2x32_split', 0 , ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))), 3769 '(options->lower_int64_options & nir_lower_shift64) != 0'), 3770 3771 (('ishl', ('u2u64', a), b), 3772 ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a), 3773 ('bcsel', ('ilt', ('iand', b, 63), 32), 3774 ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32) )), 3775 ('pack_64_2x32_split', 0 , ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))), 3776 '(options->lower_int64_options & nir_lower_shift64) != 0'), 3777 3778 # If ineg64 is lowered, then the negation is not free. Try to eliminate 3779 # some of the negations. 3780 (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3781 (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3782 (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3783 (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3784 3785 (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)), 3786 (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)), 3787 3788 # If the hardware can do int64, the shift is the same cost as the add. It 3789 # should be fine to do this transformation unconditionally.
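   # In two's complement, x + x and x << 1 produce identical bit patterns for
   # every x, including the overflowing case, so the patterns below need no
   # extra condition.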
3790 (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)), 3791 (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)), 3792] 3793 3794parser = argparse.ArgumentParser() 3795parser.add_argument('--out', required=True) 3796args = parser.parse_args() 3797 3798with open(args.out, "w", encoding='utf-8') as f: 3799 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) 3800 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", 3801 before_ffma_optimizations).render()) 3802 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64", 3803 before_lower_int64_optimizations).render()) 3804 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late", 3805 late_optimizations).render()) 3806 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods", 3807 distribute_src_mods).render()) 3808
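# A minimal sketch (not called anywhere) of how an additional pass could be
# generated with the same machinery, assuming the (<search>, <replace>
# [, <condition>]) tuple form documented at the top of this file.  The pass
# name and the two patterns are illustrative only.
def _example_extra_pass():
   example_list = [
      # x * 1.0 == x holds exactly, including for NaN and signed zero.
      (('fmul', a, 1.0), a),
      # x + 0.0 is only inexact because -0.0 + 0.0 == +0.0, hence the '~'.
      (('~fadd', a, 0.0), a),
   ]
   return nir_algebraic.AlgebraicPass("nir_opt_algebraic_example",
                                      example_list).render()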