# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

signed_zero_preserve_16 = 'nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 16)'
signed_zero_preserve_32 = 'nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 32)'
signed_zero_nan_preserve_16 = ('(nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 16) ||'
                               ' nir_is_float_control_nan_preserve(info->float_controls_execution_mode, 16))')
signed_zero_nan_preserve_32 = ('(nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 32) ||'
                               ' nir_is_float_control_nan_preserve(info->float_controls_execution_mode, 32))')
signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'

has_fmulz = '(options->has_fmulz || \
             (options->has_fmulz_no_denorms && \
              !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
67# 68# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where: 69# "#" indicates that the given variable will only match constants, 70# type indicates that the given variable will only match values from ALU 71# instructions with the given output type, 72# (cond) specifies an additional condition function (see nir_search_helpers.h), 73# swiz is a swizzle applied to the variable (only in the <replace> expression) 74# 75# For constants, you have to be careful to make sure that it is the right 76# type because python is unaware of the source and destination types of the 77# opcodes. 78# 79# All expression types can have a bit-size specified. For opcodes, this 80# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a 81# type and size. In the search half of the expression this indicates that it 82# should only match that particular bit-size. In the replace half of the 83# expression this indicates that the constructed value should have that 84# bit-size. 85# 86# If the opcode in a replacement expression is prefixed by a '!' character, 87# this indicated that the new expression will be marked exact. 88# 89# A special condition "many-comm-expr" can be used with expressions to note 90# that the expression and its subexpressions have more commutative expressions 91# than nir_replace_instr can handle. If this special condition is needed with 92# another condition, the two can be separated by a comma (e.g., 93# "(many-comm-expr,is_used_once)"). 94 95# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648 96def lowered_sincos(c): 97 x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0) 98 x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0) 99 return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x) 100 101def intBitsToFloat(i): 102 return struct.unpack('!f', struct.pack('!I', i))[0] 103 104optimizations = [ 105 106 (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 107 (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'), 108 (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'), 109 (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'), 110 (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'), 111 (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 112 (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'), 113 114 (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))), 115 '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'), 116 117 (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 118 (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 119 (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), 120 (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'), 121 (('udiv', a, 1), a), 122 (('idiv', a, 1), a), 123 (('umod', a, 1), 0), 124 (('imod', a, 1), 0), 125 (('imod', a, -1), 0), 126 (('irem', a, 1), 0), 127 (('irem', a, -1), 0), 128 (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'), 129 (('idiv', a, 
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0.  If it's only used by a
   # floating point instruction, they should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_preserve_16),
   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_preserve_32),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)),
   (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))),
   (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))),
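   # Illustrative example for the two ushr-comparison rules above: with
   # #b == 4, ieq(ushr(a, 4), 0) is exactly ult(a, 0x10), and
   # ine(ushr(a, 4), 0) is exactly uge(a, 0x10).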
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_nan_preserve_16),
   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_nan_preserve_32),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_preserve_32),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_preserve_32),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals.  If it's only used by
   # a floating point instruction, they should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_preserve_16),
   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_preserve_32),
   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_preserve_32),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_preserve_32),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_preserve_32),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros on each of the first positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
    ('fmulz', a, b), has_fmulz+' && !'+signed_zero_preserve_32),
   (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
    ('fmulz', a, b), has_fmulz+' && !'+signed_zero_preserve_32),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
    ('ffmaz', a, b, c), has_fmulz+' && !'+signed_zero_preserve_32),
   (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
    ('ffmaz', a, b, c), has_fmulz+' && !'+signed_zero_preserve_32),

   # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b))
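   #
   # Reasoning sketch: when b == 0.0, fmulz(a, b) is 0.0 regardless of 'a'
   # (even for Inf or NaN), and fexp2(0.0) == 1.0, so the replacement produces
   # the same value as the 1.0 branch of the bcsel.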
   (('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, b))),
    ('fexp2', ('fmulz', a, b)),
    has_fmulz+' && !'+signed_zero_inf_nan_preserve_32),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                         ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                         ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                         ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                         ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                          ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                          ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                 ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                 ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
    optimizations.extend([
       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
       # These are the same as the previous three rules, but it depends on
       # 1-fsat(x) <=> fsat(1-x).  See below.
       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

       # These two aren't flrp lowerings, but do appear in some shaders.
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

       # 1 - ((1 - a) * (1 - b))
       # 1 - (1 - a - b + a*b)
       # 1 - 1 + a + b - a*b
       # a + b - a*b
       # a + b*(1 - a)
       # b*(1 - a) + 1*a
       # flrp(b, 1, a)
       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
    ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),

   (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)'),

   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', a, 0.0), 0.0),
   (('fdot3', a, 0.0), 0.0),
   (('fdot2', a, 0.0), 0.0),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
    mask = s - 1

    ishl = "ishl@{}".format(s)
    ishr = "ishr@{}".format(s)
    ushr = "ushr@{}".format(s)

    in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

    optimizations.extend([
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
    ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4.  This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.
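# For example (illustrative): a 32-bit offset computed as ((a + 16) >> 2) << 2
# becomes (a + 16) & 0xfffffffc, and because 16 is a multiple of 4 the add can
# then be pulled out of the mask as ((a & 0xfffffffc) + 16), so several such
# address calculations can share a single iand.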

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
    a_sz = 'a@{}'.format(size)

    optimizations.extend([
       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

       # This does not trivially work with ishr.
       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
    ])

optimizations.extend([
   (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
   (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
    v = 1 << log2
    mask = 0xffffffff & ~(v - 1)
    b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

    optimizations.extend([
       # Reassociate for improved CSE
       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
    ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    lo_mask = 0xffffffff >> i
    hi_mask = (0xffffffff << i) & 0xffffffff

    optimizations.extend([
       # This pattern seems to only help in the soft-fp64 code.
       (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#      (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#      (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

       (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
       (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#      (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
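   #
   # For example (illustrative): ishl(iadd(a, 4), 2), i.e. (a + 4) << 2,
   # becomes (a << 2) + 16, so the 'a << 2' can be shared with other address
   # calculations and the +16 can often be folded into a load/store offset.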
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   (('ieq', ('ineg', a), 0), ('ieq', a, 0)),
   (('ine', ('ineg', a), 0), ('ine', a, 0)),
   (('ieq', ('iabs', a), 0), ('ieq', a, 0)),
   (('ine', ('iabs', a), 0), ('ine', a, 0)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
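   #
   # Reasoning sketch: -(b2f(a) + b2f(b)) is always 0.0, -1.0 or -2.0, so
   # fmin(c, -(b2f(a) + b2f(b))) < 0.0 holds exactly when c < 0.0 or the sum
   # is nonzero, i.e. (c < 0.0) || a || b, which is the replacement below.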
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0), a),
   (('ieq', 'a@1', False), ('inot', a)),
   (('ieq', 'a@1', True), a),
   (('ine', 'a@1', False), a),
   (('ine', 'a@1', True), ('inot', a)),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a) because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('ult', b, a), b, a), ('umin', a, b)),
   (('bcsel', ('ult', a, b), b, a), ('umax', a, b)),
   (('bcsel', ('uge', a, b), b, a), ('umin', a, b)),
   (('bcsel', ('uge', b, a), b, a), ('umax', a, b)),
   (('bcsel', ('ilt', b, a), b, a), ('imin', a, b)),
   (('bcsel', ('ilt', a, b), b, a), ('imax', a, b)),
   (('bcsel', ('ige', a, b), b, a), ('imin', a, b)),
   (('bcsel', ('ige', b, a), b, a), ('imax', a, b)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
])

for N in [8, 16, 32, 64]:
    b2iN = 'b2i{0}'.format(N)
    optimizations.extend([
       (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
       (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
    ])

for N in [16, 32, 64]:
    b2fN = 'b2f{0}'.format(N)
    optimizations.extend([
       (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
       (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
    ])

# Integer sizes
for s in [8, 16, 32, 64]:
    optimizations.extend([
       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

       # Simplify logic to detect sign of an integer.
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)),
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
    ])

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0.  Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_preserve_32),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 ||
   # a.y < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step.  Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
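   #
   # For example (illustrative), the first rule below turns
   # ((a < c) || d) || (b < c) into ((!fmin(a, b) < c) || d) in one step,
   # rather than first reassociating the iors and then fusing the compares.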
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
   # numbers, then it can be replaced with fneu.
   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),

   # Other patterns may optimize the resulting iand tree further.
   (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)),
    ('iand', ('iand', a, b), ('iand', c, b))),
])

# Float sizes
for s in [16, 32, 64]:
    if s == 64:
        match_fsign_cond = "!options->lower_fsign & !(options->lower_doubles_options & nir_lower_dsign)"
    else:
        match_fsign_cond = "!options->lower_fsign"
    optimizations.extend([
       # These derive from the previous patterns with the application of b < 0 <=>
       # 0 < -b.  The transformation should be applied if either comparison is
       # used once as this ensures that the number of comparisons will not
       # increase.  The sources to the ior and iand are not symmetric, so the
       # rules have to be duplicated to get this behavior.
       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

       (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
       (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
       (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
       (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),

       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
       # with the bcsel, it's basically copysign(1.0, a).  There are some
       # behavior differences between this pattern and copysign w.r.t. ±0 and
       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
       # to x, regardless of whether either or both values are NaN.
       #
       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
       #
       # For all other values of 'a', the original and replacement behave as
       # copysign.
       #
       # Marking the replacement comparisons as precise prevents any future
       # optimizations from replacing either of the comparisons with the
       # logical-not of the other.
       #
       # Note: Use b2i32 in the replacement because some platforms that
       # support fp16 don't support int16.
       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

       # The C spec says, "If the value of the integral part cannot be represented
       # by the integer type, the behavior is undefined."  "Undefined" can mean
       # "the conversion doesn't happen at all."
       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

       # Ironically, mark these as imprecise because removing the conversions may
       # preserve more precision than doing the conversions (e.g.,
       # uint(float(0x81818181u)) == 0x81818200).
       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond),
       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond),

       # float? -> float? -> floatS ==> float? -> floatS
       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

       # int? -> float? -> floatS ==> int? -> floatS
       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

       # float? -> float? -> intS ==> float? -> intS
       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),

       # HLSL's sign function returns an integer
       (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)),
    ])

    for B in [32, 64]:
        if s < B:
            optimizations.extend([
               # S = smaller, B = bigger
               # floatS -> floatB -> floatS ==> identity
               (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),

               # floatS -> floatB -> intB ==> floatS -> intB
               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),

               # int? -> floatB -> floatS ==> int? -> floatS
-> floatS 1024 (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)), 1025 (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)), 1026 ]) 1027 1028for S in [1, 8, 16, 32]: 1029 for B in [8, 16, 32, 64]: 1030 if B <= S: 1031 continue 1032 optimizations.extend([ 1033 # intS -> intB -> intS ==> identity 1034 (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a), 1035 (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a), 1036 ]) 1037 1038 if B < 16: 1039 continue 1040 for C in [8, 16, 32, 64]: 1041 if C <= S: 1042 continue 1043 optimizations.extend([ 1044 # intS -> intC -> floatB ==> intS -> floatB 1045 (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)), 1046 (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)), 1047 ]) 1048 1049# mediump variants of the above 1050optimizations.extend([ 1051 # int32 -> float32 -> float16 ==> int32 -> float16 1052 (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)), 1053 (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)), 1054 1055 # float32 -> float16 -> int16 ==> float32 -> int16 1056 (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)), 1057 (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)), 1058 1059 # float32 -> int32 -> int16 ==> float32 -> int16 1060 (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)), 1061 (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)), 1062 1063 # int32 -> int16 -> float16 ==> int32 -> float16 1064 (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)), 1065 (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)), 1066]) 1067 1068# Clean up junk left from 8-bit integer to 16-bit integer lowering. 1069optimizations.extend([ 1070 # The u2u16(u2u8(X)) just masks off the upper 8-bits of X. This can be 1071 # accomplished by mask the upper 8-bit of the immediate operand to the 1072 # iand instruction. Often times, both patterns will end up being applied 1073 # to the same original expression tree. 
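   # As an illustrative sanity check (not one of the generated rules), for a
   # 16-bit X the round trip u2u16(u2u8(X)) is just X & 0xff, and that mask
   # folds into the existing iand by masking its constant operand instead:
   #
   #    X = 0x1234, b = 0x0f0f
   #    (X & 0xff) & b  ==  0x34 & 0x0f0f  ==  0x0004
   #    X & (b & 0xff)  ==  0x1234 & 0x000f  ==  0x0004
   #
   # The two rules below perform exactly that rewrite.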
1074 (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))), 1075 (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))), 1076]) 1077 1078for op in ['iand', 'ior', 'ixor']: 1079 optimizations.extend([ 1080 (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))), 1081 (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))), 1082 1083 # Undistribute extract from a logic op 1084 ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)), 1085 ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)), 1086 ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)), 1087 ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)), 1088 1089 # Undistribute shifts from a logic op 1090 ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)), 1091 ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)), 1092 ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)), 1093 ]) 1094 1095# Integer sizes 1096for s in [8, 16, 32, 64]: 1097 last_shift_bit = int(math.log2(s)) - 1 1098 1099 optimizations.extend([ 1100 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'), 1101 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'), 1102 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'), 1103 (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'), 1104 (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'), 1105 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'), 1106 1107 # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 
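      # Worked through for a sized boolean (illustration only):
      #
      #    a = True  (~0):  b2i(a) = 1,  ineg(1) = -1 = ~0 = True
      #    a = False ( 0):  b2i(a) = 0,  ineg(0) =  0      = False
      #
      # so the rule below drops the ineg(b2i(...)) round trip entirely.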
1108 (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1109 1110 # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1111 (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1112 (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1113 (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1114 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))), 1115 ]) 1116 1117optimizations.extend([ 1118 # Common pattern like 'if (i == 0 || i == 1 || ...)' 1119 (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1120 (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1121 (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1122 (('ior', a, ('ieq', a, False)), True), 1123 1124 (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1125 (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1126 1127 (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'), 1128 ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)), 1129 ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))), 1130 ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b))) 1131 ) 1132 ), 1133 1134 (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), 1135 ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'), 1136 1137 (('ior', ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)), 1138 (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)), 1139 1140 # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code. 1141 # The first part of the iand comes from the !__feq64_nonnan. 1142 # 1143 # The second pattern is a reformulation of the first based on the relation 1144 # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1145 # happens to be y == 0. 1146 (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1147 ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1148 (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1149 ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1150 1151 # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1152 # transformations occur before constant propagation and loop-unrolling. 1153 # 1154 # The flt versions are exact. If isnan(a), the original pattern is 1155 # trivially false, and the replacements are false too. 
If isnan(b): 1156 # 1157 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1158 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1159 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1160 (('~fge', a, ('fmin', b, a)), True), 1161 (('~fge', ('fmax', a, b), a), True), 1162 (('flt', a, ('fmin', b, a)), False), 1163 (('flt', ('fmax', a, b), a), False), 1164 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1165 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1166 1167 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1168 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1169 (('ige', a, ('imin', b, a)), True), 1170 (('ige', ('imax', a, b), a), True), 1171 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1172 (('ult', ('umin', a, b), a), ('ult', b, a)), 1173 (('uge', a, ('umin', b, a)), True), 1174 (('uge', ('umax', a, b), a), True), 1175 (('ilt', a, ('imin', b, a)), False), 1176 (('ilt', ('imax', a, b), a), False), 1177 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1178 (('ige', ('imin', a, b), a), ('ige', b, a)), 1179 (('ult', a, ('umin', b, a)), False), 1180 (('ult', ('umax', a, b), a), False), 1181 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1182 (('uge', ('umin', a, b), a), ('uge', b, a)), 1183 (('ult', a, ('iand', b, a)), False), 1184 (('ult', ('ior', a, b), a), False), 1185 (('uge', a, ('iand', b, a)), True), 1186 (('uge', ('ior', a, b), a), True), 1187 1188 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1189 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1190 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1191 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1192 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1193 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1194 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1195 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1196 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1197 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1198 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1199 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1200 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1201 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1202 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1203 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1204 1205 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1206 # negative. 
1207 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1208 ('iabs', ('ishr', a, b))), 1209 (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 1210 1211 (('fabs', ('slt', a, b)), ('slt', a, b)), 1212 (('fabs', ('sge', a, b)), ('sge', a, b)), 1213 (('fabs', ('seq', a, b)), ('seq', a, b)), 1214 (('fabs', ('sne', a, b)), ('sne', a, b)), 1215 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1216 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1217 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1218 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1219 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1220 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1221 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1222 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1223 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1224 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1225 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1226 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1227 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1228 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1229 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1230 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1231 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1232 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1233 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1234 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1235 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1236 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1237 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1238 (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'), 1239 (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'), 1240 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1241 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1242 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1243 (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1244 (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1245]) 1246 1247def vector_cmp(reduce_op, cmp_op, comps): 1248 if len(comps) == 1: 1249 return (cmp_op, 'a.' + comps[0], 'b.' 
+ comps[0]) 1250 else: 1251 mid = len(comps) // 2 1252 return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]), 1253 vector_cmp(reduce_op, cmp_op, comps[mid:])) 1254 1255for op in [ 1256 ('ball_iequal', 'ieq', 'iand'), 1257 ('ball_fequal', 'feq', 'iand'), 1258 ('bany_inequal', 'ine', 'ior'), 1259 ('bany_fnequal', 'fneu', 'ior'), 1260]: 1261 optimizations.extend([ 1262 ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'), 1263 ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'), 1264 ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'), 1265 ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'), 1266 ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'), 1267 ]) 1268 1269optimizations.extend([ 1270 (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1271 (('feq', ('sne', a, b), 1.0), ('fneu', a, b)), 1272 (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1273 (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1274 (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1275 (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1276 (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1277 (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1278 (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1279 (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1280 (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1281 (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1282 (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1283 (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1284 (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1285 (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1286 1287 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1288 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1289 # Emulating booleans 1290 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1291 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1292 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1293 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1294 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1295 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1296 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1297 (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 1298 # Comparison with the same args. Note that these are only done for the 1299 # float versions when the source must be a number. Generally, NaN cmp NaN 1300 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1301 # is false, and, for any number X, X < X is also false. 
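   # A quick check of that in Python (illustration only; NIR's flt/fge use
   # the same IEEE ordered-comparison rules):
   #
   #    >>> nan = float("nan")
   #    >>> (nan < nan, 1.0 < 1.0)    # agree, so flt(a, a) folds unconditionally
   #    (False, False)
   #    >>> (nan >= nan, 1.0 >= 1.0)  # disagree, so fge(a, a) needs is_a_number
   #    (False, True)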
1302 (('ilt', a, a), False), 1303 (('ige', a, a), True), 1304 (('ieq', a, a), True), 1305 (('ine', a, a), False), 1306 (('ult', a, a), False), 1307 (('uge', a, a), True), 1308 (('flt', a, a), False), 1309 (('fge', 'a(is_a_number)', a), True), 1310 (('feq', 'a(is_a_number)', a), True), 1311 (('fneu', 'a(is_a_number)', a), False), 1312 # Logical and bit operations 1313 (('iand', a, a), a), 1314 (('iand', a, 0), 0), 1315 (('iand', a, -1), a), 1316 (('iand', a, ('inot', a)), 0), 1317 (('ior', a, a), a), 1318 (('ior', a, 0), a), 1319 (('ior', a, -1), -1), 1320 (('ior', a, ('inot', a)), -1), 1321 (('ixor', a, a), 0), 1322 (('ixor', a, 0), a), 1323 (('ixor', a, ('ixor', a, b)), b), 1324 (('ixor', a, -1), ('inot', a)), 1325 (('inot', ('inot', a)), a), 1326 (('ior', ('iand', a, b), b), b), 1327 (('ior', ('ior', a, b), b), ('ior', a, b)), 1328 (('iand', ('ior', a, b), b), b), 1329 (('iand', ('iand', a, b), b), ('iand', a, b)), 1330 1331 # It is common for sequences of (x & 1) to occur in large trees. Replacing 1332 # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "& 1333 # 1" to eventually bubble up to the top of the tree. 1334 (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), 1335 ('iand', a, ('iand', b, c))), 1336 1337 (('iand@64', a, '#b(is_lower_half_zero)'), 1338 ('pack_64_2x32_split', 0, 1339 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1340 '!options->lower_pack_64_2x32_split'), 1341 (('iand@64', a, '#b(is_upper_half_zero)'), 1342 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1343 0), 1344 '!options->lower_pack_64_2x32_split'), 1345 (('iand@64', a, '#b(is_lower_half_negative_one)'), 1346 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1347 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1348 '!options->lower_pack_64_2x32_split'), 1349 (('iand@64', a, '#b(is_upper_half_negative_one)'), 1350 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1351 ('unpack_64_2x32_split_y', a)), 1352 '!options->lower_pack_64_2x32_split'), 1353 1354 (('ior@64', a, '#b(is_lower_half_zero)'), 1355 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1356 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1357 '!options->lower_pack_64_2x32_split'), 1358 (('ior@64', a, '#b(is_upper_half_zero)'), 1359 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1360 ('unpack_64_2x32_split_y', a)), 1361 '!options->lower_pack_64_2x32_split'), 1362 (('ior@64', a, '#b(is_lower_half_negative_one)'), 1363 ('pack_64_2x32_split', -1, 1364 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1365 '!options->lower_pack_64_2x32_split'), 1366 (('ior@64', a, '#b(is_upper_half_negative_one)'), 1367 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1368 -1), 1369 '!options->lower_pack_64_2x32_split'), 1370 1371 (('ixor@64', a, '#b(is_lower_half_zero)'), 1372 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1373 ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1374 '!options->lower_pack_64_2x32_split'), 1375 (('ixor@64', a, '#b(is_upper_half_zero)'), 1376 ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1377 ('unpack_64_2x32_split_y', a)), 1378 '!options->lower_pack_64_2x32_split'), 1379 1380 # DeMorgan's Laws 1381 (('iand', ('inot', a), ('inot', b)), 
('inot', ('ior', a, b))), 1382 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1383 # Shift optimizations 1384 (('ishl', 0, a), 0), 1385 (('ishl', a, 0), a), 1386 (('ishr', 0, a), 0), 1387 (('ishr', -1, a), -1), 1388 (('ishr', a, 0), a), 1389 (('ushr', 0, a), 0), 1390 (('ushr', a, 0), a), 1391 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'), 1392 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'), 1393 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'), 1394 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'), 1395 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'), 1396 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'), 1397 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'), 1398 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'), 1399 (('urol@8', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 8, b))), '!options->has_rotate8'), 1400 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'), 1401 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'), 1402 (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))), 1403 (('uror@8', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 8, b))), '!options->has_rotate8'), 1404 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'), 1405 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'), 1406 (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))), 1407 1408 # bfi(X, a, b) = (b & ~X) | (a & X) 1409 # If X = ~0: (b & 0) | (a & 0xffffffff) = a 1410 # If X = 0: (b & 0xffffffff) | (a & 0) = b 1411 (('bfi', 0xffffffff, a, b), a), 1412 (('bfi', 0x00000000, a, b), b), 1413 1414 # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the 1415 # bfi is either b or c. 1416 (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)), 1417 1418 # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a) 1419 # = (a & a) | (b & ~a) If a is odd, find_lsb(a) == 0 1420 # = a | (b & ~a) 1421 # = a | b 1422 (('bfi', '#a(is_odd)', a, b), ('ior', a, b)), 1423 1424 # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a) 1425 # = ((b << find_lsb(a)) & a) 1426 # = (b & a) If a is odd, find_lsb(a) == 0 1427 (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)), 1428 1429 # Because 'a' is a positive power of two, the result of the bfi is either 0 1430 # or 'a' depending on whether or not 'b' is odd. Use 'b&1' for the zero 1431 # value to help platforms that can't have two constants in a bcsel. 
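   # For example (illustration only), take a = 8 = 0b1000, so find_lsb(a) = 3:
   #
   #    bfi(8, b, 0) = ((b << 3) & 8) | (0 & ~8) = 8 if b is odd, else 0
   #
   # so u2f(bfi(8, b, 0)) is either u2f(8) or 0.0, i.e. a bcsel on (b & 1).
   # In the zero case (b & 1) == 0, which is why it can stand in for the
   # constant 0.0 in the first replacement below.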
1432 (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1433 ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))), 1434 (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1435 ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))), 1436 1437 # Exponential/logarithmic identities 1438 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1439 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1440 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1441 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1442 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1443 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 1444 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1445 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1446 (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)), 1447 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1448 (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)), 1449 (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))), 1450 (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))), 1451 (('~fpow', a, 1.0), a), 1452 (('~fpow', a, 2.0), ('fmul', a, a)), 1453 (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)), 1454 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1455 (('~fpow', 2.0, a), ('fexp2', a)), 1456 (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1457 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1458 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1459 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1460 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1461 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1462 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1463 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1464 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1465 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1466 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1467 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1468 (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1469 # Division and reciprocal 1470 (('~fdiv', 1.0, a), ('frcp', a)), 1471 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1472 (('~frcp', ('frcp', a)), a), 1473 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1474 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1475 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1476 # Trig 1477 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1478 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1479 # Boolean simplifications 1480 (('ieq', a, True), a), 1481 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1482 (('ine', a, False), a), 1483 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1484 (('bcsel', a, True, False), a), 1485 (('bcsel', a, False, True), ('inot', a)), 1486 (('bcsel', True, b, c), b), 1487 (('bcsel', False, b, c), c), 1488 1489 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1490 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1491 (('bcsel@16', a, -1.0, -0.0), ('fneg', 
('b2f', a))), 1492 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1493 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1494 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1495 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1496 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1497 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1498 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1499 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1500 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1501 1502 (('bcsel', a, b, b), b), 1503 (('~fcsel', a, b, b), b), 1504 1505 # D3D Boolean emulation 1506 (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 1507 (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 1508 (('bcsel', a, 1, 0), ('b2i', 'a@1')), 1509 (('bcsel', a, 0, 1), ('b2i', ('inot', a))), 1510 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1511 ('ineg', ('b2i', ('iand', a, b)))), 1512 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1513 ('ineg', ('b2i', ('ior', a, b)))), 1514 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1515 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1516 (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1517 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1518 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1519 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1520 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1521 1522 # With D3D booleans, imax is AND and umax is OR 1523 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1524 ('ineg', ('b2i', ('iand', a, b)))), 1525 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1526 ('ineg', ('b2i', ('ior', a, b)))), 1527 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1528 ('ineg', ('b2i', ('ior', a, b)))), 1529 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1530 ('ineg', ('b2i', ('iand', a, b)))), 1531 (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1532 (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1533 1534 # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op. 
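   # That is (illustration only): b2i(True) & 1 == 1 == b2i(True) and
   # b2i(False) & 1 == 0 == b2i(False), so the iand below can be dropped.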
1535 (('iand', ('b2i', a), 1), ('b2i', a)), 1536 1537 (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))), 1538 (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))), 1539 1540 # Conversions 1541 (('f2i', ('ftrunc', a)), ('f2i', a)), 1542 (('f2u', ('ftrunc', a)), ('f2u', a)), 1543 1544 # Conversions from 16 bits to 32 bits and back can always be removed 1545 (('f2fmp', ('f2f32', 'a@16')), a), 1546 (('i2imp', ('i2i32', 'a@16')), a), 1547 (('i2imp', ('u2u32', 'a@16')), a), 1548 1549 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1550 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1551 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1552 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1553 1554 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1555 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1556 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1557 1558 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1559 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1560 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1561 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1562 1563 # Conversions to 16 bits would be lossy so they should only be removed if 1564 # the instruction was generated by the precision lowering pass. 1565 (('f2f32', ('f2fmp', 'a@32')), a), 1566 (('i2i32', ('i2imp', 'a@32')), a), 1567 (('u2u32', ('i2imp', 'a@32')), a), 1568 1569 # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32 1570 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1571 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1572 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1573 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1574 1575 # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32 1576 (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)), 1577 (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)), 1578 (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)), 1579 1580 (('ffloor', 'a(is_integral)'), a), 1581 (('fceil', 'a(is_integral)'), a), 1582 (('ftrunc', 'a(is_integral)'), a), 1583 (('fround_even', 'a(is_integral)'), a), 1584 1585 # fract(x) = x - floor(x), so fract(NaN) = NaN 1586 (('~ffract', 'a(is_integral)'), 0.0), 1587 (('fabs', 'a(is_not_negative)'), a), 1588 (('iabs', 'a(is_not_negative)'), a), 1589 (('fsat', 'a(is_not_positive)'), 0.0), 1590 1591 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1592 1593 # The result of the multiply must be in [-1, 0], so the result of the ffma 1594 # must be in [0, 1]. 1595 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1596 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1597 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1598 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1599 1600 (('fneu', 'a(is_not_zero)', 0.0), True), 1601 (('feq', 'a(is_not_zero)', 0.0), False), 1602 1603 # In this chart, + means value > 0 and - means value < 0. 1604 # 1605 # + >= + -> unknown 0 >= + -> false - >= + -> false 1606 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1607 # + >= - -> true 0 >= - -> true - >= - -> unknown 1608 # 1609 # Using grouping conceptually similar to a Karnaugh map... 
1610 # 1611 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1612 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1613 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1614 # 1615 # The flt / ilt cases just invert the expected result. 1616 # 1617 # The results expecting true, must be marked imprecise. The results 1618 # expecting false are fine because NaN compared >= or < anything is false. 1619 1620 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1621 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1622 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1623 1624 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1625 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1626 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1627 1628 (('ine', 'a(is_not_zero)', 0), True), 1629 (('ieq', 'a(is_not_zero)', 0), False), 1630 1631 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1632 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1633 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1634 1635 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1636 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1637 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1638 1639 (('ult', 0, 'a(is_gt_zero)'), True), 1640 (('ult', a, 0), False), 1641 1642 # Packing and then unpacking does nothing 1643 (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 1644 (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 1645 (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'), 1646 (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'), 1647 (('unpack_64_2x32_split_x', ('u2u64', 'a@32')), a), 1648 (('unpack_64_2x32_split_y', ('u2u64', a)), 0), 1649 (('unpack_64_2x32_split_x', ('i2i64', 'a@32')), a), 1650 (('unpack_64_2x32_split_y', ('i2i64(is_used_once)', 'a@32')), ('ishr', a, 31)), 1651 (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)), 1652 (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1653 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1654 (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1655 ('unpack_64_2x32_split_y', a)), a), 1656 (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a), 1657 ('unpack_64_2x32_split_y', a))), a), 1658 (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1659 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1660 1661 (('unpack_64_4x16', ('pack_64_4x16', a)), a), 1662 (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)), 1663 (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)), 1664 1665 # Comparing two halves of an unpack separately. While this optimization 1666 # should be correct for non-constant values, it's less obvious that it's 1667 # useful in that case. For constant values, the pack will fold and we're 1668 # guaranteed to reduce the whole tree to one instruction. 
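   # Concretely (illustration only), for a 32-bit 'a' and constants b and c:
   #
   #    (a & 0xffff) == b  &&  (a >> 16) == c
   #      <=>  a == (b | (c << 16))  ==  pack_32_2x16_split(b, c)
   #
   # and with b and c constant the pack folds to a single immediate, so the
   # whole tree becomes a single ieq.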
1669 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1670 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1671 ('ieq', a, ('pack_32_2x16_split', b, c))), 1672 1673 # Byte extraction 1674 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1675 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1676 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1677 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1678 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1679 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1680 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1681 1682 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1683 # storage buffer. 1684 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1685 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1686 1687 # Common pattern after lowering 8-bit integers to 16-bit. 1688 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1689 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1690 1691 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1692 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1693 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1694 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1695 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1696 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1697 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1698 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1699 1700 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1701 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1702 1703 # Word extraction 1704 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1705 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1706 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1707 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1708 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1709 1710 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1711 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1712 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1713 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1714 1715 # Packing a u8vec4 to write to an SSBO. 
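   # The shift/or tree below builds a word with 'a' in the most significant
   # byte and 'd' in the least significant byte, so the replacement reverses
   # the operand order (illustration, assuming component 0 of pack_32_4x8
   # lands in the least significant byte):
   #
   #    (a << 24) | (b << 16) | (c << 8) | d  ==  pack_32_4x8(vec4(d, c, b, a))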
1716 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1717 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1718 1719 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1720 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1721 1722 # Lower pack/unpack 1723 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1724 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'), 1725 (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'), 1726 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1727 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1728 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1729 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1730 1731 (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1732 (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1733 1734 # Useless masking before unpacking 1735 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1736 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1737 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1738 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1739 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1740 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1741 1742 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1743 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1744 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1745 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1746 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1747 1748 # Optimize half packing 1749 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1750 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1751 1752 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1753 ('pack_half_2x16', ('vec2', a, b))), 1754 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1755 ('pack_half_2x16', ('vec2', a, b))), 1756 1757 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 1758 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 1759 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 1760 1761 (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)), 1762 (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)), 1763 (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), 
('pack_half_2x16_rtz_split', a, 0)), 1764 1765 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1766 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1767 1768 (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 1769 (('ior', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 1770 1771 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 1772 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 1773 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 1774 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 1775 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 1776 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 1777 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 1778 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 1779 1780 # Reduce intermediate precision with int64. 1781 (('u2u32', ('iadd(is_used_once)', 'a@64', b)), 1782 ('iadd', ('u2u32', a), ('u2u32', b))), 1783]) 1784 1785# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 1786# patterns like those below. 1787for op in ('ushr', 'ishr'): 1788 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 1789 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 1790 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 1791 1792optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 1793 1794# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 1795# patterns like those below. 1796for op in ('extract_u8', 'extract_i8'): 1797 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 1798 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 1799 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 1800 1801optimizations.extend([ 1802 # Subtracts 1803 (('ussub_4x8_vc4', a, 0), a), 1804 (('ussub_4x8_vc4', a, ~0), 0), 1805 # Lower all Subtractions first - they can get recombined later 1806 (('fsub', a, b), ('fadd', a, ('fneg', b))), 1807 (('isub', a, b), ('iadd', a, ('ineg', b))), 1808 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1809 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 
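   # For instance (32-bit illustration only): a = INT32_MIN, b = INT32_MAX.
   # The true |a - b| is 0xffffffff, which fits in uint32 but not in int32.
   # The wrapping isub gives a - b = 1, and since a < b the bcsel takes
   # ineg(1) = 0xffffffff, which is the correct unsigned result.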
1810 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1811 (('bitz', a, b), ('inot', ('bitnz', a, b))), 1812 1813 # Propagate negation up multiplication chains 1814 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 1815 (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_preserve_32), 1816 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 1817 (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 1818 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 1819 1820 # Propagate constants up multiplication chains 1821 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 1822 (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 1823 (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 1824 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 1825 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 1826 (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 1827 (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 1828 # Prefer moving out a multiplication for more MAD/FMA-friendly code 1829 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 1830 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 1831 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 1832 (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 1833 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 1834 1835 # Reassociate constants in add/mul chains so they can be folded together. 1836 # For now, we mostly only handle cases where the constants are separated by 1837 # a single non-constant. We could do better eventually. 
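   # E.g. (illustration only): fmul(2.0, fmul(x, 3.0)) reassociates to
   # fmul(fmul(2.0, 3.0), x), which constant folding then reduces to
   # fmul(6.0, x).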
1838 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 1839 (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 1840 (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 1841 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 1842 (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 1843 (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 1844 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 1845 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 1846 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 1847 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 1848 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1849 (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 1850 (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1851 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 1852 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 1853 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 1854 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 1855 1856 # Reassociate add chains for more MAD/FMA-friendly code 1857 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 1858 1859 # Drop mul-div by the same value when there's no wrapping. 1860 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 1861 1862 # By definition... 
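   # find_lsb and ufind_msb are defined to return -1 when no bits are set,
   # and ifind_msb returns -1 for the inputs 0 and -1, so the bcsel in each
   # pattern below just re-selects the value the opcode already produces.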
1863 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 1864 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1865 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1866 (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 1867 (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 1868 1869 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 1870 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1871 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1872 (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 1873 (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 1874 1875 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 1876 (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 1877 1878 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 1879 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1880 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 1881 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1882 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 1883 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1884 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 1885 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1886 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1887 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1888 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1889 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1890 1891 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'), 1892 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 1893 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 1894 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 1895 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', 
('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'), 1896 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 1897 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 1898 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 1899 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 1900 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 1901 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 1902 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 1903 1904 # This is safe. Both ufind_msb_rev and bitfield_reverse can only have 1905 # 32-bit sources, so the transformation can only generate correct NIR. 1906 (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 1907 (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'), 1908 1909 (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))), 1910 1911 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1912 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1913 (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1914 (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1915 (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)), 1916 (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)), 1917 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 1918 1919 (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 1920 (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 1921 1922 # Misc. 
lowering 1923 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 1924 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 1925 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 1926 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 1927 1928 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1929 ('bcsel', ('ult', 31, 'bits'), 'insert', 1930 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 1931 'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'), 1932 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1933 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1934 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1935 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1936 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1937 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1938 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1939 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1940 1941 (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 1942 (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 1943 1944 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1945 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 1946 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 1947 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'), 1948 1949 # int64_t sum = a + b; 1950 # 1951 # if (a < 0 && b < 0 && a < sum) 1952 # sum = INT64_MIN; 1953 # } else if (a >= 0 && b >= 0 && sum < a) 1954 # sum = INT64_MAX; 1955 # } 1956 # 1957 # A couple optimizations are applied. 1958 # 1959 # 1. a < sum => sum >= 0. This replacement works because it is known that 1960 # a < 0 and b < 0, so sum should also be < 0 unless there was 1961 # underflow. 1962 # 1963 # 2. sum < a => sum < 0. This replacement works because it is known that 1964 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 1965 # overflow. 1966 # 1967 # 3. Invert the second if-condition and swap the order of parameters for 1968 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 1969 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 1970 # 1971 # On Intel Gen11, this saves ~11 instructions. 
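   # Worked example of the negative-overflow case (illustration only, using
   # 4-bit arithmetic so the numbers stay small; INT_MIN = -8, INT_MAX = 7):
   #
   #    a = -5, b = -6:  the wrapping iadd gives -11 + 16 = 5 >= 0, so the
   #    outer condition (a < 0 && b < 0 && sum >= 0) holds and the result
   #    saturates to INT_MIN, as desired.
   #
   #    a = -5, b = 2:   iadd gives -3 < 0, the outer condition fails, the
   #    inner condition (a < 0 || b < 0 || sum >= 0) holds, and the plain
   #    sum -3 is returned.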
1972 (('iadd_sat@64', a, b), ('bcsel', 1973 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1974 0x8000000000000000, 1975 ('bcsel', 1976 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1977 ('iadd', a, b), 1978 0x7fffffffffffffff)), 1979 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1980 1981 # int64_t sum = a - b; 1982 # 1983 # if (a < 0 && b >= 0 && a < sum) 1984 # sum = INT64_MIN; 1985 # } else if (a >= 0 && b < 0 && a >= sum) 1986 # sum = INT64_MAX; 1987 # } 1988 # 1989 # Optimizations similar to the iadd_sat case are applied here. 1990 (('isub_sat@64', a, b), ('bcsel', 1991 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1992 0x8000000000000000, 1993 ('bcsel', 1994 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1995 ('isub', a, b), 1996 0x7fffffffffffffff)), 1997 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1998 1999 # These are done here instead of in the backend because the int64 lowering 2000 # pass will make a mess of the patterns. The first patterns are 2001 # conditioned on nir_lower_minmax64 because it was not clear that it was 2002 # always an improvement on platforms that have real int64 support. No 2003 # shaders in shader-db hit this, so it was hard to say one way or the 2004 # other. 2005 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2006 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2007 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2008 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2009 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2010 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2011 2012 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2013 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2014 # 0u < uint(a) <=> uint(a) != 0u 2015 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2016 2017 # Alternative lowering that doesn't rely on bfi. 2018 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2019 ('bcsel', ('ult', 31, 'bits'), 2020 'insert', 2021 (('ior', 2022 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 2023 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 2024 'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'), 2025 2026 # Alternative lowering that uses bitfield_select. 
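# As noted further below, bitfield_select(mask, insert, base) computes
# (mask & insert) | (~mask & base), so with mask = bfm(bits, offset) and the
# insert value pre-shifted by offset this is the usual read-modify-write form
# of bitfieldInsert().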
2027 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2028 ('bcsel', ('ult', 31, 'bits'), 'insert', 2029 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 2030 'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'), 2031 2032 (('ibitfield_extract', 'value', 'offset', 'bits'), 2033 ('bcsel', ('ult', 31, 'bits'), 'value', 2034 ('ibfe', 'value', 'offset', 'bits')), 2035 'options->lower_bitfield_extract && options->has_bfe'), 2036 2037 (('ubitfield_extract', 'value', 'offset', 'bits'), 2038 ('bcsel', ('ult', 31, 'bits'), 'value', 2039 ('ubfe', 'value', 'offset', 'bits')), 2040 'options->lower_bitfield_extract && options->has_bfe'), 2041 2042 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 2043 (('bitfield_select', a, b, 0), ('iand', a, b)), 2044 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 2045 2046 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 2047 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 2048 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 2049 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 2050 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 2051 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 2052 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 2053 2054 # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such 2055 (('ult', a, ('umin', ('iand', a, b), c)), False), 2056 (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False), 2057 (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2058 ('ubfe', 'value', 'offset', 'width')), 2059 (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2060 ('ibfe', 'value', 'offset', 'width')), 2061 (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'), 2062 ('bfm', 'width', 'offset')), 2063 2064 # open-coded BFM 2065 (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'), 2066 (('ishl', ('bfm', a, 0), b), ('bfm', a, b)), 2067 2068 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2069 # 2070 # If bits is zero, the result will be zero. 2071 # 2072 # These patterns prevent other patterns from generating invalid results 2073 # when count is zero. 
2074 (('ubfe', a, b, 0), 0), 2075 (('ibfe', a, b, 0), 0), 2076 2077 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 2078 2079 (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)), 2080 (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 2081 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2082 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2083 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2084 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2085 2086 (('ibitfield_extract', 'value', 'offset', 'bits'), 2087 ('bcsel', ('ieq', 0, 'bits'), 2088 0, 2089 ('ishr', 2090 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 2091 ('isub', 32, 'bits'))), 2092 'options->lower_bitfield_extract && !options->has_bfe'), 2093 2094 (('ubitfield_extract', 'value', 'offset', 'bits'), 2095 ('iand', 2096 ('ushr', 'value', 'offset'), 2097 ('bcsel', ('ieq', 'bits', 32), 2098 0xffffffff, 2099 ('isub', ('ishl', 1, 'bits'), 1))), 2100 'options->lower_bitfield_extract && !options->has_bfe'), 2101 2102 (('ifind_msb', 'value'), 2103 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 2104 'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'), 2105 2106 (('ifind_msb', 'value'), 2107 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 2108 ('isub', 31, ('ifind_msb_rev', 'value')), 2109 ('ifind_msb_rev', 'value')), 2110 'options->lower_ifind_msb && options->has_find_msb_rev'), 2111 2112 # uclz of an absolute value source almost always does the right thing. 2113 # There are a couple problem values: 2114 # 2115 # * 0x80000000. Since abs(0x80000000) == 0x80000000, uclz returns 0. 2116 # However, findMSB(int(0x80000000)) == 30. 2117 # 2118 # * 0xffffffff. Since abs(0xffffffff) == 1, uclz returns 31. Section 8.8 2119 # (Integer Functions) of the GLSL 4.50 spec says: 2120 # 2121 # For a value of zero or negative one, -1 will be returned. 2122 # 2123 # * Negative powers of two. uclz(abs(-(1<<x))) returns x, but 2124 # findMSB(-(1<<x)) should return x-1. 2125 # 2126 # For all negative number cases, including 0x80000000 and 0xffffffff, the 2127 # correct value is obtained from uclz if instead of negating the (already 2128 # negative) value the logical-not is used. A conditional logical-not can 2129 # be achieved by (x ^ (x >> 31)). 
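# Quick sanity check with the worst case from above: for value = 0x80000000,
# value >> 31 == 0xffffffff (arithmetic shift), so value ^ (value >> 31) ==
# 0x7fffffff, uclz of that is 1, and 31 - 1 == 30, matching
# findMSB(int(0x80000000)) == 30.  For non-negative values the xor is a no-op
# and 31 - uclz(value) is the usual unsigned findMSB.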
2130 (('ifind_msb', 'value'), 2131 ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))), 2132 'options->lower_ifind_msb && options->has_uclz'), 2133 2134 (('ufind_msb', 'value@32'), 2135 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 2136 ('isub', 31, ('ufind_msb_rev', 'value')), 2137 ('ufind_msb_rev', 'value')), 2138 'options->lower_ufind_msb && options->has_find_msb_rev'), 2139 2140 (('ufind_msb', 'value@32'), 2141 ('isub', 31, ('uclz', 'value')), 2142 'options->lower_ufind_msb && options->has_uclz'), 2143 2144 (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'), 2145 2146 (('find_lsb', 'value@64'), 2147 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 2148 'options->lower_find_lsb'), 2149 2150 (('find_lsb', 'value'), 2151 ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))), 2152 'options->lower_find_lsb'), 2153 2154 (('extract_i8', a, 'b@32'), 2155 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 2156 'options->lower_extract_byte'), 2157 2158 (('extract_u8', a, 'b@32'), 2159 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 2160 'options->lower_extract_byte'), 2161 2162 (('extract_i16', a, 'b@32'), 2163 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 2164 'options->lower_extract_word'), 2165 2166 (('extract_u16', a, 'b@32'), 2167 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 2168 'options->lower_extract_word'), 2169 2170 (('pack_unorm_2x16', 'v'), 2171 ('pack_uvec2_to_uint', 2172 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 2173 'options->lower_pack_unorm_2x16'), 2174 2175 (('pack_unorm_4x8', 'v'), 2176 ('pack_uvec4_to_uint', 2177 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2178 'options->lower_pack_unorm_4x8'), 2179 2180 (('pack_snorm_2x16', 'v'), 2181 ('pack_uvec2_to_uint', 2182 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 2183 'options->lower_pack_snorm_2x16'), 2184 2185 (('pack_snorm_4x8', 'v'), 2186 ('pack_uvec4_to_uint', 2187 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2188 'options->lower_pack_snorm_4x8'), 2189 2190 (('unpack_unorm_2x16', 'v'), 2191 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 2192 ('extract_u16', 'v', 1))), 2193 65535.0), 2194 'options->lower_unpack_unorm_2x16'), 2195 2196 (('unpack_unorm_4x8', 'v'), 2197 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 2198 ('extract_u8', 'v', 1), 2199 ('extract_u8', 'v', 2), 2200 ('extract_u8', 'v', 3))), 2201 255.0), 2202 'options->lower_unpack_unorm_4x8'), 2203 2204 (('unpack_snorm_2x16', 'v'), 2205 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 2206 ('extract_i16', 'v', 1))), 2207 32767.0))), 2208 'options->lower_unpack_snorm_2x16'), 2209 2210 (('unpack_snorm_4x8', 'v'), 2211 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 2212 ('extract_i8', 'v', 1), 2213 ('extract_i8', 'v', 2), 2214 ('extract_i8', 'v', 3))), 2215 127.0))), 2216 'options->lower_unpack_snorm_4x8'), 2217 2218 (('pack_half_2x16_split', 'a@32', 'b@32'), 2219 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 2220 'options->lower_pack_split'), 2221 2222 (('unpack_half_2x16_split_x', 'a@32'), 2223 ('f2f32', ('u2u16', a)), 2224 'options->lower_pack_split'), 2225 2226 (('unpack_half_2x16_split_y', 'a@32'), 2227 ('f2f32', ('u2u16', ('ushr', a, 16))), 2228 'options->lower_pack_split'), 2229 2230 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 2231 
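# The clamp form works because, for integers, every value <= -1 clamps to -1,
# every value >= 1 clamps to 1, and 0 stays 0 -- exactly the definition of
# sign().  The next two patterns undo this lowering when isign is supported.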
(('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 2232 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 2233 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 2234 # Mark the new comparisons precise to prevent them being changed to 'a != 2235 # 0' or 'a == 0'. 2236 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 2237 (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'), 2238 2239 # Address/offset calculations: 2240 # Drivers supporting imul24 should use the nir_lower_amul() pass, this 2241 # rule converts everyone else to imul: 2242 (('amul', a, b), ('imul', a, b), '!options->has_imul24'), 2243 2244 (('umul24', a, b), 2245 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), 2246 '!options->has_umul24'), 2247 (('umad24', a, b, c), 2248 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c), 2249 '!options->has_umad24'), 2250 2251 # Relaxed 24bit ops 2252 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'), 2253 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'), 2254 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'), 2255 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'), 2256 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'), 2257 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'), 2258 2259 (('imad24_ir3', a, b, 0), ('imul24', a, b)), 2260 (('imad24_ir3', a, 0, c), (c)), 2261 (('imad24_ir3', a, 1, c), ('iadd', a, c)), 2262 2263 # if first two srcs are const, crack apart the imad so constant folding 2264 # can clean up the imul: 2265 # TODO ffma should probably get a similar rule: 2266 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)), 2267 2268 # These will turn 24b address/offset calc back into 32b shifts, but 2269 # it should be safe to get back some of the bits of precision that we 2270 # already decided were no necessary: 2271 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 2272 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 2273 (('imul24', a, 0), (0)), 2274]) 2275 2276for bit_size in [8, 16, 32, 64]: 2277 cond = '!options->lower_uadd_sat' 2278 if bit_size == 64: 2279 cond += ' && !(options->lower_int64_options & nir_lower_iadd64)' 2280 add = 'iadd@' + str(bit_size) 2281 2282 optimizations += [ 2283 (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond), 2284 (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond), 2285 (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond), 2286 (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond), 2287 ] 2288 2289for bit_size in [8, 16, 32, 64]: 2290 cond = '!options->lower_usub_sat' 2291 if bit_size == 64: 2292 cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)' 2293 add = 'iadd@' + str(bit_size) 2294 2295 optimizations += [ 2296 (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond), 2297 (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond), 2298 (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond), 2299 (('bcsel', ('ine', 
('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond), 2300 ] 2301 2302# bit_size dependent lowerings 2303for bit_size in [8, 16, 32, 64]: 2304 # convenience constants 2305 intmax = (1 << (bit_size - 1)) - 1 2306 intmin = 1 << (bit_size - 1) 2307 2308 optimizations += [ 2309 (('iadd_sat@' + str(bit_size), a, b), 2310 ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)), 2311 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'), 2312 (('isub_sat@' + str(bit_size), a, b), 2313 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)), 2314 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'), 2315 ] 2316 2317invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')]) 2318 2319for left, right in itertools.combinations_with_replacement(invert.keys(), 2): 2320 optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))), 2321 ('iand', (invert[left], a, b), (invert[right], c, d)))) 2322 optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))), 2323 ('ior', (invert[left], a, b), (invert[right], c, d)))) 2324 2325# Optimize x2yN(b2x(x)) -> b2y 2326for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): 2327 if x != 'f' and y != 'f' and x != y: 2328 continue 2329 2330 b2x = 'b2f' if x == 'f' else 'b2i' 2331 b2y = 'b2f' if y == 'f' else 'b2i' 2332 x2yN = '{}2{}'.format(x, y) 2333 optimizations.append(((x2yN, (b2x, a)), (b2y, a))) 2334 2335# Optimize away x2xN(a@N) 2336for t in ['int', 'uint', 'float', 'bool']: 2337 for N in type_sizes(t): 2338 x2xN = '{0}2{0}{1}'.format(t[0], N) 2339 aN = 'a@{0}'.format(N) 2340 optimizations.append(((x2xN, aN), a)) 2341 2342# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers 2343# In particular, we can optimize away everything except upcast of downcast and 2344# upcasts where the type differs from the other cast 2345for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')): 2346 if N < M: 2347 # The outer cast is a down-cast. It doesn't matter what the size of the 2348 # argument of the inner cast is because we'll never been in the upcast 2349 # of downcast case. Regardless of types, we'll always end up with y2yN 2350 # in the end. 2351 for x, y in itertools.product(['i', 'u'], ['i', 'u']): 2352 x2xN = '{0}2{0}{1}'.format(x, N) 2353 y2yM = '{0}2{0}{1}'.format(y, M) 2354 y2yN = '{0}2{0}{1}'.format(y, N) 2355 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a))) 2356 elif N > M: 2357 # If the outer cast is an up-cast, we have to be more careful about the 2358 # size of the argument of the inner cast and with types. In this case, 2359 # the type is always the type of type up-cast which is given by the 2360 # outer cast. 2361 for P in type_sizes('uint'): 2362 # We can't optimize away up-cast of down-cast. 
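      # For instance i2i32(i2i16(a@32)) truncates to 16 bits and then
      # sign-extends back to 32 bits, which is not a no-op, so that
      # combination is skipped here.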
2363 if M < P: 2364 continue 2365 2366 # Because we're doing down-cast of down-cast, the types always have 2367 # to match between the two casts 2368 for x in ['i', 'u']: 2369 x2xN = '{0}2{0}{1}'.format(x, N) 2370 x2xM = '{0}2{0}{1}'.format(x, M) 2371 aP = 'a@{0}'.format(P) 2372 optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a))) 2373 else: 2374 # The N == M case is handled by other optimizations 2375 pass 2376 2377# Downcast operations should be able to see through pack 2378for t in ['i', 'u']: 2379 for N in [8, 16, 32]: 2380 x2xN = '{0}2{0}{1}'.format(t, N) 2381 optimizations += [ 2382 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2383 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2384 ] 2385 2386# Optimize comparisons with up-casts 2387for t in ['int', 'uint', 'float']: 2388 for N, M in itertools.product(type_sizes(t), repeat=2): 2389 if N == 1 or N >= M: 2390 continue 2391 2392 cond = 'true' 2393 if N == 8: 2394 cond = 'options->support_8bit_alu' 2395 elif N == 16: 2396 cond = 'options->support_16bit_alu' 2397 x2xM = '{0}2{0}{1}'.format(t[0], M) 2398 x2xN = '{0}2{0}{1}'.format(t[0], N) 2399 aN = 'a@' + str(N) 2400 bN = 'b@' + str(N) 2401 xeq = 'feq' if t == 'float' else 'ieq' 2402 xne = 'fneu' if t == 'float' else 'ine' 2403 xge = '{0}ge'.format(t[0]) 2404 xlt = '{0}lt'.format(t[0]) 2405 2406 # Up-casts are lossless so for correctly signed comparisons of 2407 # up-casted values we can do the comparison at the largest of the two 2408 # original sizes and drop one or both of the casts. (We have 2409 # optimizations to drop the no-op casts which this may generate.) 2410 for P in type_sizes(t): 2411 if P == 1 or P > N: 2412 continue 2413 2414 bP = 'b@' + str(P) 2415 optimizations += [ 2416 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2417 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2418 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2419 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2420 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2421 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2422 ] 2423 2424 # The next bit doesn't work on floats because the range checks would 2425 # get way too complicated. 2426 if t in ['int', 'uint']: 2427 if t == 'int': 2428 xN_min = -(1 << (N - 1)) 2429 xN_max = (1 << (N - 1)) - 1 2430 elif t == 'uint': 2431 xN_min = 0 2432 xN_max = (1 << N) - 1 2433 else: 2434 assert False 2435 2436 # If we're up-casting and comparing to a constant, we can unfold 2437 # the comparison into a comparison with the shrunk down constant 2438 # and a check that the constant fits in the smaller bit size. 
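   # As a concrete (purely illustrative) instance, for t == 'uint', N == 16,
   # M == 32 the equality rule below becomes:
   #
   #    u2u32(a@16) == #b  ->  (a == u2u16(b)) && (u2u32(u2u16(b)) == b)
   #
   # The second conjunct is false whenever the constant b does not fit in
   # 16 bits, in which case the original equality could not hold anyway.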
2439 optimizations += [ 2440 ((xeq, (x2xM, aN), '#b'), 2441 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond), 2442 ((xne, (x2xM, aN), '#b'), 2443 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond), 2444 ((xlt, (x2xM, aN), '#b'), 2445 ('iand', (xlt, xN_min, b), 2446 ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond), 2447 ((xlt, '#a', (x2xM, bN)), 2448 ('iand', (xlt, a, xN_max), 2449 ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond), 2450 ((xge, (x2xM, aN), '#b'), 2451 ('iand', (xge, xN_max, b), 2452 ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond), 2453 ((xge, '#a', (x2xM, bN)), 2454 ('iand', (xge, a, xN_min), 2455 ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond), 2456 ] 2457 2458# Convert masking followed by signed downcast to just unsigned downcast 2459optimizations += [ 2460 (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)), 2461 (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)), 2462 (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)), 2463 (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)), 2464 (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)), 2465 (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)), 2466] 2467 2468# Some operations such as iadd have the property that the bottom N bits of the 2469# output only depends on the bottom N bits of each of the inputs so we can 2470# remove casts 2471for N in [16, 32]: 2472 for M in [8, 16]: 2473 if M >= N: 2474 continue 2475 2476 aN = 'a@' + str(N) 2477 u2uM = 'u2u{0}'.format(M) 2478 i2iM = 'i2i{0}'.format(M) 2479 2480 for x in ['u', 'i']: 2481 x2xN = '{0}2{0}{1}'.format(x, N) 2482 extract_xM = 'extract_{0}{1}'.format(x, M) 2483 2484 x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M) 2485 extract_xM_M_bits = \ 2486 '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M) 2487 optimizations += [ 2488 ((x2xN_M_bits, (u2uM, aN)), a), 2489 ((extract_xM_M_bits, aN, 0), a), 2490 ] 2491 2492 bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M) 2493 optimizations += [ 2494 ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)), 2495 ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)), 2496 ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)), 2497 ] 2498 2499 for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']: 2500 op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M) 2501 optimizations += [ 2502 ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)), 2503 ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)), 2504 ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)), 2505 ] 2506 2507def fexp2i(exp, bits): 2508 # Generate an expression which constructs value 2.0^exp or 0.0. 2509 # 2510 # We assume that exp is already in a valid range: 2511 # 2512 # * [-15, 15] for 16-bit float 2513 # * [-127, 127] for 32-bit float 2514 # * [-1023, 1023] for 16-bit float 2515 # 2516 # If exp is the lowest value in the valid range, a value of 0.0 is 2517 # constructed. Otherwise, the value 2.0^exp is constructed. 2518 if bits == 16: 2519 return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) 2520 elif bits == 32: 2521 return ('ishl', ('iadd', exp, 127), 23) 2522 elif bits == 64: 2523 return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20)) 2524 else: 2525 assert False 2526 2527def ldexp(f, exp, bits): 2528 # The maximum possible range for a normal exponent is [-126, 127] and, 2529 # throwing in denormals, you get a maximum range of [-149, 127]. This 2530 # means that we can potentially have a swing of +-276. 
If you start with 2531 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2532 # all the way to zero. The GLSL spec only requires that we handle a subset 2533 # of this range. From version 4.60 of the spec: 2534 # 2535 # "If exp is greater than +128 (single-precision) or +1024 2536 # (double-precision), the value returned is undefined. If exp is less 2537 # than -126 (single-precision) or -1022 (double-precision), the value 2538 # returned may be flushed to zero. Additionally, splitting the value 2539 # into a significand and exponent using frexp() and then reconstructing 2540 # a floating-point value using ldexp() should yield the original input 2541 # for zero and all finite non-denormalized values." 2542 # 2543 # The SPIR-V spec has similar language. 2544 # 2545 # In order to handle the maximum value +128 using the fexp2i() helper 2546 # above, we have to split the exponent in half and do two multiply 2547 # operations. 2548 # 2549 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2550 # twice the full range that is valid for the fexp2i() function above. If 2551 # exp/2 is the bottom value of that range, the fexp2i() expression will 2552 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2553 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2554 # value is clamped from above, then it must have been above the supported 2555 # range of the GLSL built-in and therefore any return value is acceptable. 2556 if bits == 16: 2557 exp = ('imin', ('imax', exp, -30), 30) 2558 elif bits == 32: 2559 exp = ('imin', ('imax', exp, -254), 254) 2560 elif bits == 64: 2561 exp = ('imin', ('imax', exp, -2046), 2046) 2562 else: 2563 assert False 2564 2565 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2566 # (We use ishr which isn't the same for -1, but the -1 case still works 2567 # since we use exp-exp/2 as the second exponent.) While the spec 2568 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2569 # work with denormals and doesn't allow for the full swing in exponents 2570 # that you can get with normalized values. Instead, we create two powers 2571 # of two and multiply by them each in turn. That way the effective range 2572 # of our exponent is doubled. 
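   # For example, with bits == 32 and exp == 254 (the clamp limit), the two
   # halves are 127 and 127, so we multiply by 2.0^127 twice instead of trying
   # to build the unrepresentable 2.0^254 in one step.  The exp == -1 case
   # mentioned above yields halves of -1 and 0, which still multiply out to
   # 2.0^-1.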
2573 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2574 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2575 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2576 2577optimizations += [ 2578 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2579 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2580 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2581] 2582 2583# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2584def bitfield_reverse_ue4(u): 2585 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2586 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2587 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2588 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2589 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2590 2591 return step5 2592 2593# Cyberpunk 2077 open-codes bitfieldReverse() 2594def bitfield_reverse_cp2077(u): 2595 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2596 step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2597 step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2598 step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2599 step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2600 2601 return step5 2602 2603optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2604optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2605 2606# VKD3D-Proton DXBC f32 to f16 conversion implements a float conversion using PackHalf2x16. 2607# Because the spec does not specify a rounding mode or behaviour regarding infinity, 2608# it emits a sequence to ensure D3D-like behaviour for infinity. 2609# When we know the current backend already behaves like we need, we can eliminate the extra sequence. 2610# 2611# Input is f32, output is u32 that has the f16 packed into its low bits. 
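# The search pattern built by the helper below corresponds to roughly this
# pseudocode, where bits() stands for the raw IEEE bit pattern and the names
# are only for exposition:
#
#    uint h = pack_half_2x16_rtz_split(x, 0.0);
#    if (bits(abs(x)) != 0x7f800000 &&     /* f32 was not infinity...   */
#        (h & 0x7fff) == 0x7c00)           /* ...but the f16 overflowed */
#       h -= 1;                            /* clamp to +/-HALF_MAX      */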
2612def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a): 2613 packed_half = ('pack_half_2x16_rtz_split', a, 0) 2614 packed_half_minus1 = ('iadd', packed_half, 0xffffffff) 2615 f32_was_not_inf = ('ine', abs_a, 0x7f800000) 2616 f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00) 2617 return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half) 2618 2619optimizations += [ 2620 (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)), 2621 (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)), 2622 (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)), 2623] 2624 2625def vkd3d_proton_msad(): 2626 pattern = None 2627 for i in range(4): 2628 ref = ('extract_u8', 'a@32', i) 2629 src = ('extract_u8', 'b@32', i) 2630 sad = ('iabs', ('iadd', ref, ('ineg', src))) 2631 msad = ('bcsel', ('ieq', ref, 0), 0, sad) 2632 if pattern == None: 2633 pattern = msad 2634 else: 2635 pattern = ('iadd', pattern, msad) 2636 pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:]) 2637 return pattern 2638 2639optimizations += [ 2640 (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'), 2641 (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)), 2642] 2643 2644 2645# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2646# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2647for ncomp in [2, 3, 4, 8, 16]: 2648 optimizations += [ 2649 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2650 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2651 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2652 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2653 ] 2654 2655# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2656# then the "a == a" is redundant because it's equivalent to "a is not NaN" 2657# and, if a is a NaN then the second comparison will fail anyway. 2658for op in ['flt', 'fge', 'feq']: 2659 optimizations += [ 2660 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 2661 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 2662 ] 2663 2664# Add optimizations to handle the case where the result of a ternary is 2665# compared to a constant. This way we can take things like 2666# 2667# (a ? 0 : 1) > 0 2668# 2669# and turn it into 2670# 2671# a ? (0 > 0) : (1 > 0) 2672# 2673# which constant folding will eat for lunch. The resulting ternary will 2674# further get cleaned up by the boolean reductions above and we will be 2675# left with just the original variable "a". 2676for op in ['feq', 'fneu', 'ieq', 'ine']: 2677 optimizations += [ 2678 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2679 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2680 ] 2681 2682for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 2683 optimizations += [ 2684 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2685 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2686 ((op, '#d', ('bcsel', a, '#b', '#c')), 2687 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 2688 ] 2689 2690 2691# For example, this converts things like 2692# 2693# 1 + mix(0, a - 1, condition) 2694# 2695# into 2696# 2697# mix(1, (a-1)+1, condition) 2698# 2699# Other optimizations will rearrange the constants. 
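# In NIR terms, for op == 'fadd' the rule instantiates to something like
# (cond and x are just placeholder names):
#
#    ('fadd', ('bcsel', cond, 0.0, ('fadd', x, -1.0)), 1.0)
#       -> ('bcsel', cond, ('fadd', 0.0, 1.0), ('fadd', ('fadd', x, -1.0), 1.0))
#
# after which constant folding collapses the then-branch to 1.0.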
2700for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']: 2701 optimizations += [ 2702 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 2703 ] 2704 2705# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives 2706# states: 2707# 2708# If neither layout qualifier is specified, derivatives in compute shaders 2709# return zero, which is consistent with the handling of built-in texture 2710# functions like texture() in GLSL 4.50 compute shaders. 2711for op in ['fddx', 'fddx_fine', 'fddx_coarse', 2712 'fddy', 'fddy_fine', 'fddy_coarse']: 2713 optimizations += [ 2714 ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE') 2715] 2716 2717# Some optimizations for ir3-specific instructions. 2718optimizations += [ 2719 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 2720 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 2721 # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'. 2722 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 2723 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 2724] 2725 2726# These kinds of sequences can occur after nir_opt_peephole_select. 2727# 2728# NOTE: fadd is not handled here because that gets in the way of ffma 2729# generation in the i965 driver. Instead, fadd and ffma are handled in 2730# late_optimizations. 2731 2732for op in ['flrp']: 2733 optimizations += [ 2734 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2735 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2736 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2737 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2738 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2739 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2740 ] 2741 2742for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 2743 optimizations += [ 2744 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2745 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2746 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2747 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2748 ] 2749 2750for op in ['fpow']: 2751 optimizations += [ 2752 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2753 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2754 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 2755 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 2756 ] 2757 2758for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']: 2759 optimizations += [ 2760 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 2761 ] 2762 2763for op in ['ineg', 'iabs', 'inot', 'isign']: 2764 optimizations += [ 2765 ((op, ('bcsel', c, '#a', '#b')), 
('bcsel', c, (op, a), (op, b))), 2766 ] 2767 2768optimizations.extend([ 2769 (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'), 2770 (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'), 2771 (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal') 2772 ]) 2773 2774 2775""" 2776 if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16) 2777 return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */; 2778 else 2779 return f2f32(f2f16(val)); 2780""" 2781optimizations.extend([ 2782 (('fquantize2f16', 'a@32'), 2783 ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)), 2784 ('iand', a, 1 << 31), 2785 ('!f2f32', ('!f2f16_rtne', a))), 2786 'options->lower_fquantize2f16') 2787 ]) 2788 2789for s in range(0, 31): 2790 mask = 0xffffffff << s 2791 2792 # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior 2793 # will never both have the same bits set, replacing the ior with an iadd 2794 # is safe (i.e., a carry out of a bit can never be generated). The iadd is 2795 # more likely to participate in other optimization patterns (e.g., iadd of 2796 # constant reassociation) 2797 optimizations.extend([ 2798 (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)), 2799 'options->avoid_ternary_with_two_constants'), 2800 ]) 2801 2802# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN. 2803# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here) 2804for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']: 2805 optimizations += [((op, '#a(is_nan)', b), NAN)] 2806 optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative 2807 2808# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN. 2809for op in ['ffma', 'flrp']: 2810 optimizations += [((op, '#a(is_nan)', b, c), NAN)] 2811 optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative 2812 optimizations += [((op, a, b, '#c(is_nan)'), NAN)] 2813 2814# NaN propagation: FP min/max. Pick the non-NaN operand. 2815for op in ['fmin', 'fmax']: 2816 optimizations += [((op, '#a(is_nan)', b), b)] # commutative 2817 2818# NaN propagation: ldexp is NaN if the first operand is NaN. 2819optimizations += [(('ldexp', '#a(is_nan)', b), NAN)] 2820 2821# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN. 2822for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']: 2823 optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative 2824 2825# NaN propagation: FP comparison opcodes except !=. Replace it with false. 2826for op in ['feq', 'fge', 'flt']: 2827 optimizations += [((op, '#a(is_nan)', b), False)] 2828 optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative 2829 2830# NaN propagation: FP comparison opcodes using !=. Replace it with true. 2831# Operator != is the only opcode where a comparison with NaN returns true. 2832for op in ['fneu']: 2833 optimizations += [((op, '#a(is_nan)', b), True)] # commutative 2834 2835# NaN propagation: FP comparison opcodes except != returning FP 0 or 1. 2836for op in ['seq', 'sge', 'slt']: 2837 optimizations += [((op, '#a(is_nan)', b), 0.0)] 2838 optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative 2839 2840# NaN propagation: FP comparison opcodes using != returning FP 0 or 1. 
2841# Operator != is the only opcode where a comparison with NaN returns true. 2842optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative 2843 2844# This section contains optimizations to propagate downsizing conversions of 2845# constructed vectors into vectors of downsized components. Whether this is 2846# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 2847# this reduces the register pressure of the vector itself and often enables the 2848# conversions to be eliminated via other algebraic rules or constant folding. 2849# In the worst case on a SIMD architecture, the propagated conversions may be 2850# revectorized via nir_opt_vectorize so instruction count is minimally 2851# impacted. 2852# 2853# On a machine with SIMD-within-a-register only, this actually 2854# counterintuitively hurts instruction count. These machines are the same that 2855# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 2856# not being set. 2857# 2858# Finally for scalar architectures, there should be no difference in generated 2859# code since it all ends up scalarized at the end, but it might minimally help 2860# compile-times. 2861 2862for i in range(2, 4 + 1): 2863 for T in ('f', 'u', 'i'): 2864 vec_inst = ('vec' + str(i),) 2865 2866 indices = ['a', 'b', 'c', 'd'] 2867 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 2868 2869 to_16 = '{}2{}16'.format(T, T) 2870 to_mp = '{}2{}mp'.format(T, T) 2871 2872 out_16 = tuple((to_16, indices[j]) for j in range(i)) 2873 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 2874 2875 optimizations += [ 2876 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 2877 ] 2878 # u2ump doesn't exist, because it's equal to i2imp 2879 if T in ['f', 'i']: 2880 optimizations += [ 2881 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 2882 ] 2883 2884# This section contains "late" optimizations that should be run before 2885# creating ffmas and calling regular optimizations for the final time. 2886# Optimizations should go here if they help code generation and conflict 2887# with the regular optimizations. 
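# For example, the first entries below rewrite ('fmul', ('fmul', a, '#b'), c),
# with a and c non-constant, into ('fmul', ('fmul', a, c), b): the constant is
# moved outward past the non-constant factors so that repeated applications
# collect the constants at the end of the chain.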
2888before_ffma_optimizations = [ 2889 # Propagate constants down multiplication chains 2890 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 2891 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 2892 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 2893 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 2894 2895 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 2896 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 2897 (('~fadd', ('fneg', a), a), 0.0), 2898 (('iadd', ('ineg', a), a), 0), 2899 (('iadd', ('ineg', a), ('iadd', a, b)), b), 2900 (('iadd', a, ('iadd', ('ineg', a), b)), b), 2901 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 2902 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 2903 2904 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 2905 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 2906 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 2907] 2908 2909# This section contains "late" optimizations that should be run after the 2910# regular optimizations have finished. Optimizations should go here if 2911# they help code generation but do not necessarily produce code that is 2912# more easily optimizable. 2913late_optimizations = [ 2914 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 2915 # results if one operand is +Inf and the other is -Inf. 2916 # 2917 # 1. Inf + -Inf = NaN 2918 # 2. ∀x: x + NaN = NaN and x - NaN = NaN 2919 # 3. ∀x: x != NaN = true 2920 # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 2921 # 2922 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 2923 # (a+b) < 0 false false false false 2924 # a < -b false false false false 2925 # -(a+b) < 0 false false false false 2926 # -a < b false false false false 2927 # (a+b) >= 0 false false false false 2928 # a >= -b true true false false 2929 # -(a+b) >= 0 false false false false 2930 # -a >= b true true false false 2931 # (a+b) == 0 false false false false 2932 # a == -b true true false false 2933 # (a+b) != 0 true true true true 2934 # a != -b false false true true 2935 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 2936 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 2937 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 2938 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 2939 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 2940 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 2941 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 2942 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 2943 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 2944 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 2945 2946 # If either source must be finite, then the original (a+b) cannot produce 2947 # NaN due to Inf-Inf. 
The patterns and the replacements produce the same 2948 # result if b is NaN. Therefore, the replacements are exact. 2949 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 2950 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 2951 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 2952 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 2953 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 2954 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 2955 2956 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 2957 # SpvOpLessOrGreater. 2958 (('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))), 2959 (('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))), 2960 2961 # This is how SpvOpFUnordEqual might be implemented. Replace it with 2962 # !SpvOpLessOrGreater. 2963 (('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))), 2964 (('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))), 2965 2966 # nir_lower_to_source_mods will collapse this, but its existence during the 2967 # optimization loop can prevent other optimizations. 2968 (('fneg', ('fneg', a)), a) 2969] 2970 2971# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c 2972# gets combined to fma(a, b, -c). 2973for sz, mulz in itertools.product([16, 32, 64], [False, True]): 2974 # fmulz/ffmaz only for fp32 2975 if mulz and sz != 32: 2976 continue 2977 2978 # Fuse the correct fmul. Only consider fmuls where the only users are fadd 2979 # (or fneg/fabs which are assumed to be propagated away), as a heuristic to 2980 # avoid fusing in cases where it's harmful. 2981 fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)' 2982 ffma = 'ffmaz' if mulz else 'ffma' 2983 2984 fadd = '~fadd@{}'.format(sz) 2985 option = 'options->fuse_ffma{}'.format(sz) 2986 2987 late_optimizations.extend([ 2988 ((fadd, (fmul, a, b), c), (ffma, a, b, c), option), 2989 2990 ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c), 2991 (ffma, ('fneg', a), b, c), option), 2992 2993 ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c), 2994 (ffma, ('fabs', a), ('fabs', b), c), option), 2995 2996 ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c), 2997 (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option), 2998 ]) 2999 3000late_optimizations.extend([ 3001 # Subtractions get lowered during optimization, so we need to recombine them 3002 (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3003 (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3004 (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3005 (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'), 3006 3007 (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 3008 (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 3009 (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 3010 (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 3011 # On Intel GPUs, the constant field for an ADD3 instruction must be either 3012 # int16_t or uint16_t. 
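 # Hence the '#a(is_16_bits)' / '#c(is_16_bits)' restrictions on the
 # constant-source variants below: a constant is only allowed to participate
 # in iadd3 formation if it fits in 16 bits.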
3013 (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'), 3014 (('iadd', ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'), 3015 (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'), ('iadd3', a, b, c), 'options->has_iadd3'), 3016 (('iadd', ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'), 3017 (('iadd', ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'), 3018 (('iadd', ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'), ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'), 3019 3020 # fneg_lo / fneg_hi 3021 (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'), 3022 (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'), 3023 3024 # These are duplicated from the main optimizations table. The late 3025 # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 3026 # new patterns like these. The patterns that compare with zero are removed 3027 # because they are unlikely to be created in by anything in 3028 # late_optimizations. 3029 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 3030 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 3031 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 3032 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 3033 3034 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 3035 3036 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 3037 3038 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 3039 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 3040 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 3041 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 3042 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 3043 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 3044 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 3045 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 3046 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 3047 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 3048 3049 (('ior', a, a), a), 3050 (('iand', a, a), a), 3051 3052 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 3053 3054 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 3055 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 3056 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 3057 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 3058 3059 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 3060 3061 # Approximate handling of fround_even for DX9 addressing from gallium nine on 3062 # DX9-class hardware with no proper fround support. 
This is in 3063 # late_optimizations so that the is_integral() opts in the main pass get a 3064 # chance to eliminate the fround_even first. 3065 (('fround_even', a), ('bcsel', 3066 ('feq', ('ffract', a), 0.5), 3067 ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 3068 ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 3069 3070 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 3071 # particular operation is common for expanding values stored in a texture 3072 # from [0,1] to [-1,1]. 3073 (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3074 (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3075 (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3076 (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3077 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3078 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3079 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3080 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3081 3082 # flrp(a, b, a) 3083 # a*(1-a) + b*a 3084 # a + -a*a + a*b (1) 3085 # a + a*(b - a) 3086 # Option 1: ffma(a, (b-a), a) 3087 # 3088 # Alternately, after (1): 3089 # a*(1+b) + -a*a 3090 # a*((1+b) + -a) 3091 # 3092 # Let b=1 3093 # 3094 # Option 2: ffma(a, 2, -(a*a)) 3095 # Option 3: ffma(a, 2, (-a)*a) 3096 # Option 4: ffma(a, -a, (2*a) 3097 # Option 5: a * (2 - a) 3098 # 3099 # There are a lot of other possible combinations. 3100 (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 3101 (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3102 (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3103 (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3104 (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3105 3106 # we do these late so that we don't get in the way of creating ffmas 3107 (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 3108 (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 3109 3110 # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 3111 # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 3112 (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 3113 ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 3114 3115 # Things that look like DPH in the source shader may get expanded to 3116 # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 3117 # to NIR. After FFMA is generated, this can look like: 3118 # 3119 # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 3120 # 3121 # Reassociate the last addition into the first multiplication. 3122 # 3123 # Some shaders do not use 'invariant' in vertex and (possibly) geometry 3124 # shader stages on some outputs that are intended to be invariant. 
For 3125 # various reasons, this optimization may not be fully applied in all 3126 # shaders used for different rendering passes of the same geometry. This 3127 # can result in Z-fighting artifacts (at best). For now, disable this 3128 # optimization in these stages. See bugzilla #111490. In tessellation 3129 # stages applications seem to use 'precise' when necessary, so allow the 3130 # optimization in those stages. 3131 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3132 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3133 (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3134 ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3135 (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3136 ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3137 3138 (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3139 ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3140 (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3141 ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3142 (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3143 ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3144 3145 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 3146 # 3147 # If bits is zero, the result will be zero. 3148 # 3149 # These prevent the next two lowerings generating incorrect results when 3150 # count is zero. 3151 (('ubfe', a, b, 0), 0), 3152 (('ibfe', a, b, 0), 0), 3153 3154 # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 3155 # instructions on Intel GPUs, it cannot have an immediate values as 3156 # sources. There are also limitations on source register strides. As a 3157 # result, it is very easy for 3-source instruction combined with either 3158 # loads of immediate values or copies from weird register strides to be 3159 # more expensive than the primitive instructions it represents. 3160 (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 3161 3162 # b is the lowest order bit to be extracted and c is the number of bits to 3163 # extract. The inner shift removes the bits above b + c by shifting left 3164 # 32 - (b + c). 
   # Clean up no-op shifts that may result from the bfe lowerings.
   (('ishl', a, 0), a),
   (('ishl', a, -32), a),
   (('ishr', a, 0), a),
   (('ishr', a, -32), a),
   (('ushr', a, 0), a),

   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),

   # open coded bit test
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'),
   (('bitz', ('ushr', a, b), 0), ('bitz', a, b)),
   (('bitz', ('ishr', a, b), 0), ('bitz', a, b)),
   (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)),
   (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)),
   (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'),
   (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'),
   (('inot', ('bitnz', a, b)), ('bitz', a, b)),
   (('inot', ('bitz', a, b)), ('bitnz', a, b)),
   (('bitnz', ('inot', a), b), ('bitz', a, b)),
   (('bitz', ('inot', a), b), ('bitnz', a, b)),
])

# A few more extract cases we'd rather leave late
for N in [16, 32]:
   aN = 'a@{0}'.format(N)
   u2uM = 'u2u{0}'.format(M)
   i2iM = 'i2i{0}'.format(M)

   for x in ['u', 'i']:
      x2xN = '{0}2{0}{1}'.format(x, N)
      extract_x8 = 'extract_{0}8'.format(x)
      extract_x16 = 'extract_{0}16'.format(x)

      late_optimizations.extend([
         ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
         ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
      ])

      if N > 16:
         late_optimizations.extend([
            ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
            ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
         ])

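# Illustrative expansion (not from upstream Mesa): for N = 32 and x = 'u' the
# loop above adds the pair below, i.e. u2u32(u2u8(a@32)) collapses to
# extract_u8(a, 0) on backends that keep byte extracts.
assert (('u2u32', ('u2u8', 'a@32')), ('extract_u8', a, 0),
        '!options->lower_extract_byte') in late_optimizations
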
# Byte insertion
late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))

late_optimizations += [
   # Word insertion
   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),

   # Extract and then insert
   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
]

# Integer sizes
for s in [8, 16, 32, 64]:
   late_optimizations.extend([
      (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),
      (('ior',  ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),
   ])

# Float sizes
for s in [16, 32, 64]:
   late_optimizations.extend([
      (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c, ('fadd', b, -1.0))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
      (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
   ])

for op in ['fadd']:
   late_optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
   ]

for op in ['ffma', 'ffmaz']:
   late_optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
   ]

# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
#
# This must be done in late optimizations, because we need normal optimizations to
# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
#
# Unary opcodes
for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

# Binary opcodes
for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

# Ternary opcodes
for op in ['ffma', 'flrp']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

# Comparison opcodes
for op in ['feq', 'fge', 'flt', 'fneu']:
   late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

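# Illustrative expansion (not from upstream Mesa): for op = 'fsin' the unary
# loop above adds the pair below, i.e. f2f32(fsin(f2fmp(a))) is rewritten to
# fsin(a) evaluated at full precision.
assert (('~f2f32', ('fsin', ('f2fmp', a))), ('fsin', a)) in late_optimizations
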
# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
   # Convert *2*mp instructions to concrete *2*16 instructions. At this point
   # any conversions that could have been removed will have been removed in
   # nir_opt_algebraic so any remaining ones are required.
   (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"),
   (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"),
   (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"),
   (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"),
   (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"),
   (('u2ump', a), ('u2u16', a), "!options->preserve_mediump"),
   (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"),
   (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),

   (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"),

   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),

   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
]

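# Illustrative check (not from upstream Mesa): the fisfinite lowering above
# relies on |x| < +inf being true exactly for finite x, because ordered
# comparisons involving NaN are false.  Python floats follow the same IEEE
# rules, so this can be spot-checked directly.
for _x in (0.0, -1.5, float("inf"), float("-inf"), float("nan")):
   assert (abs(_x) < float("inf")) == (not (math.isinf(_x) or math.isnan(_x)))
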
distribute_src_mods = [
   # Try to remove some spurious negations rather than pushing them down.
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),

   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),

   # fdph works mostly like fdot (fdph(a, b) = dot(a.xyz, b.xyz) + b.w), but
   # because the .w term comes only from the second source, the negation must
   # be applied to the second source to get the correct result.
   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),

   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                  before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
                                  distribute_src_mods).render())
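
# Illustrative spot check (not from upstream Mesa): the distribute_src_mods
# rules above rely on -min(a, b) == max(-a, -b) and on fdph only taking the
# .w term from its second source, so -fdph(a, b) == fdph(a, -b).  Model that
# with plain Python values for a couple of sample vectors.
def _fdph(v3, v4):
   # Reference homogeneous dot product: dot(v3, v4.xyz) + v4.w.
   return v3[0] * v4[0] + v3[1] * v4[1] + v3[2] * v4[2] + v4[3]

_v3 = (1.0, -2.0, 0.5)
_v4 = (3.0, 0.25, -1.0, 4.0)
assert -min(1.5, -2.0) == max(-1.5, 2.0)
assert -_fdph(_v3, _v4) == _fdph(_v3, tuple(-x for x in _v4))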