# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

signed_zero_preserve_16 = 'nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 16)'
signed_zero_preserve_32 = 'nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 32)'
signed_zero_nan_preserve_16 = ('(nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 16) ||'
                               ' nir_is_float_control_nan_preserve(info->float_controls_execution_mode, 16))')
signed_zero_nan_preserve_32 = ('(nir_is_float_control_signed_zero_preserve(info->float_controls_execution_mode, 32) ||'
                               ' nir_is_float_control_nan_preserve(info->float_controls_execution_mode, 32))')
signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'

has_fmulz = '(options->has_fmulz || \
              (options->has_fmulz_no_denorms && \
               !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").

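# For example, the rule
#
#    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c)))
#
# below rewrites a*b + a*c into a*(b + c), and the '~' means the rule is
# skipped when the fadd is marked exact.  A rule may also carry a condition
# string, e.g.
#
#    (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)),
#     '!options->lower_bitops')
#
# which only matches when 'b' is a constant positive power of two and the
# generated C condition evaluates to true.
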
# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    # Map the angle into [-1, 1): x = 2*fract(a/(2*pi) + c) - 1
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    # Parabolic approximation: x = 4*(x - x*|x|)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    # Refinement step: 0.225*(x*|x| - x) + x (the 0.225 constant comes from the linked post)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

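# Reinterpret a 32-bit integer bit pattern as a float, e.g.
# intBitsToFloat(0x3f800000) == 1.0 and intBitsToFloat(0x7f800000) == inf.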
def intBitsToFloat(i):
    return struct.unpack('!f', struct.pack('!I', i))[0]

optimizations = [

106   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
107   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
108   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
109   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
110   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
111   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
112   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
113
114   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
115    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),
116
117   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
118   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
119   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
120   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
121   (('udiv', a, 1), a),
122   (('idiv', a, 1), a),
123   (('umod', a, 1), 0),
124   (('imod', a, 1), 0),
125   (('imod', a, -1), 0),
126   (('irem', a, 1), 0),
127   (('irem', a, -1), 0),
128   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
129   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
130   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
131   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
132   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
133   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
134   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
135   (('irem', a, '#b(is_pos_power_of_two)'),
136    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
137    '!options->lower_bitops'),
138   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),
139
140   (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'),
141
142   (('~fneg', ('fneg', a)), a),
143   (('ineg', ('ineg', a)), a),
144   (('fabs', ('fneg', a)), ('fabs', a)),
145   (('fabs', ('u2f', a)), ('u2f', a)),
146   (('iabs', ('iabs', a)), ('iabs', a)),
147   (('iabs', ('ineg', a)), ('iabs', a)),
148   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, that instruction should flush any input
   # denormals and we can replace -0.0 with 0.0 if the float execution mode
   # allows it.
152   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_preserve_16),
153   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_preserve_32),
154   (('iadd', a, 0), a),
155   (('iadd_sat', a, 0), a),
156   (('isub_sat', a, 0), a),
157   (('uadd_sat', a, 0), a),
158   (('usub_sat', a, 0), a),
159   (('usadd_4x8_vc4', a, 0), a),
160   (('usadd_4x8_vc4', a, ~0), ~0),
161   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
162   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
163   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
164   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
165   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
166   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
167   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
168   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
169   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
170   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
171   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
172   (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)),
173   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
174   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
175   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)),
176   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)),
177   (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))),
178   (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))),
179   (('~fadd', ('fneg', a), a), 0.0),
180   (('iadd', ('ineg', a), a), 0),
181   (('iadd', ('ineg', a), ('iadd', a, b)), b),
182   (('iadd', a, ('iadd', ('ineg', a), b)), b),
183   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
184   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
185   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
186   (('~fmul', a, 0.0), 0.0),
187   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
188   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_nan_preserve_16),
189   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_nan_preserve_32),
190   (('fmulz', a, 0.0), 0.0),
191   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_preserve_32),
192   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
193   (('fmulz', a, a), ('fmul', a, a)),
194   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_preserve_32),
195   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
196   (('ffmaz', a, a, b), ('ffma', a, a, b)),
197   (('imul', a, 0), 0),
198   (('umul_unorm_4x8_vc4', a, 0), 0),
199   (('umul_unorm_4x8_vc4', a, ~0), a),
200   (('~fmul', a, 1.0), a),
201   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, that instruction should flush any input
   # denormals and this multiplication isn't needed.
205   (('fmul(is_only_used_as_float)', a, 1.0), a),
206   (('imul', a, 1), a),
207   (('fmul', a, -1.0), ('fneg', a)),
208   (('imul', a, -1), ('ineg', a)),
209   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
210   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
211   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
212   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
213   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
214   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
215   (('~ffma', 0.0, a, b), b),
216   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
217   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
218   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
219   (('~ffma', a, b, 0.0), ('fmul', a, b)),
220   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_preserve_16),
221   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_preserve_32),
222   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_preserve_32),
223   (('ffma', 1.0, a, b), ('fadd', a, b)),
224   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_preserve_32),
225   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
226   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_preserve_32),
227   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
228   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
229   (('~flrp', a, b, 0.0), a),
230   (('~flrp', a, b, 1.0), b),
231   (('~flrp', a, a, b), a),
232   (('~flrp', 0.0, a, b), ('fmul', a, b)),
233
234   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
235   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
236
237   (('sdot_4x8_iadd', a, 0, b), b),
238   (('udot_4x8_uadd', a, 0, b), b),
239   (('sdot_4x8_iadd_sat', a, 0, b), b),
240   (('udot_4x8_uadd_sat', a, 0, b), b),
241   (('sdot_2x16_iadd', a, 0, b), b),
242   (('udot_2x16_uadd', a, 0, b), b),
243   (('sdot_2x16_iadd_sat', a, 0, b), b),
244   (('udot_2x16_uadd_sat', a, 0, b), b),
245
   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with a zero in each of the first two source positions.
248   (('sudot_4x8_iadd', a, 0, b), b),
249   (('sudot_4x8_iadd', 0, a, b), b),
250   (('sudot_4x8_iadd_sat', a, 0, b), b),
251   (('sudot_4x8_iadd_sat', 0, a, b), b),
252
253   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
254   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
255   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
256   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
257   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),
258
259   # Try to let constant folding eliminate the dot-product part.  These are
260   # safe because the dot product cannot overflow 32 bits.
261   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
262   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
263   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
264   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
265   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
266   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
267   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
268   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
269   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
270   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
271   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
272   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
273   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
274   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
275   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
276   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
277
278   # Optimize open-coded fmulz.
279   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
280   (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
281    ('fmulz', a, b), has_fmulz+' && !'+signed_zero_preserve_32),
282   (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
283    ('fmulz', a, b), has_fmulz+' && !'+signed_zero_preserve_32),
284
285   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
286   (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
287    ('ffmaz', a, b, c), has_fmulz+' && !'+signed_zero_preserve_32),
288   (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
289    ('ffmaz', a, b, c), has_fmulz+' && !'+signed_zero_preserve_32),
290
291   # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b))
292   (('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, b))),
293    ('fexp2', ('fmulz', a, b)),
294    has_fmulz+' && !'+signed_zero_inf_nan_preserve_32),
295]
296
297# Shorthand for the expansion of just the dot product part of the [iu]dp4a
298# instructions.
299sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
300                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
301                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
302                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
303udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
304                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
305                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
306                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
307sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
308                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
309                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
310                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
311sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
312                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
313udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
314                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
315
316optimizations.extend([
317   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
318   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
319   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
320   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
321   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),
322
   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
327   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
328   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'),
329
330   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
331   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
332   # don't have to worry about that intermediate result overflowing or
333   # underflowing.
334   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'),
335
336   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'),
337
338   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
339   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
340])
341
342# Float sizes
343for s in [16, 32, 64]:
344    optimizations.extend([
345       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
346
347       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
348       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
349       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),
350
351       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
352       # These are the same as the previous three rules, but it depends on
353       # 1-fsat(x) <=> fsat(1-x).  See below.
354       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
355       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
356
357       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
358       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
359
360       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
361       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),
362
363       # These two aren't flrp lowerings, but do appear in some shaders.
364       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
365       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),
366
367       # 1 - ((1 - a) * (1 - b))
368       # 1 - (1 - a - b + a*b)
369       # 1 - 1 + a + b - a*b
370       # a + b - a*b
371       # a + b*(1 - a)
372       # b*(1 - a) + 1*a
373       # flrp(b, 1, a)
374       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
375    ])
376
377optimizations.extend([
378   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
379
380   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
381
382   (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
383   (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
384   (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)'),
385
386   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
387   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
388   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
389   (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'),
390   (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'),
391   (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
392   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
393   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
394   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
395   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
396   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
397   (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
398   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
399   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
400   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
401   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
402   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
403   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
404   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
405   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
406   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
407   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
408
409   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
410    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
411
412   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
413
414   (('fdot4', a, 0.0), 0.0),
415   (('fdot3', a, 0.0), 0.0),
416   (('fdot2', a, 0.0), 0.0),
417
418   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
419   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
420   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
421   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),
422
423   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
424   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),
425
426   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
427   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
428
429   # Lower fdot to fsum when it is available
430   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
431   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
432   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
433   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
434
435   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
436   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
437   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
438   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
439
440   # (a * #b + #c) << #d
441   # ((a * #b) << #d) + (#c << #d)
442   # (a * (#b << #d)) + (#c << #d)
443   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
444    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),
445
446   # (a * #b) << #c
447   # a * (#b << #c)
448   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
449])
450
# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
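# For example, with 32-bit values, ((a << 20) << 20) must yield 0, but naively
# adding the shift counts would give (a << 40), which NIR evaluates as (a << 8).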
454for s in [8, 16, 32, 64]:
455   mask = s - 1
456
457   ishl = "ishl@{}".format(s)
458   ishr = "ishr@{}".format(s)
459   ushr = "ushr@{}".format(s)
460
461   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
462
463   optimizations.extend([
464       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
465       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
466
       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
469       ((ishr, (ishr, a, '#b'), '#c'),
470        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
471   ])
472
# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand, and
# the additions before it can be reassociated to CSE the iand instruction.
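# For example, with 32-bit values, ('ishl', ('ushr', a, 2), 2) becomes
# ('iand', a, 0xfffffffc) once the shifted mask is constant-folded.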
476
477for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
478    a_sz = 'a@{}'.format(size)
479
480    optimizations.extend([
481       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
482       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
483       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
484
485       # This does not trivially work with ishr.
486       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
487    ])
488
489optimizations.extend([
490    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
491    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
492])
493
494for log2 in range(1, 7): # powers of two from 2 to 64
495   v = 1 << log2
496   mask = 0xffffffff & ~(v - 1)
497   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
498
499   optimizations.extend([
500       # Reassociate for improved CSE
501       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
502   ])
503
504# To save space in the state tables, reduce to the set that is known to help.
505# Previously, this was range(1, 32).  In addition, a couple rules inside the
506# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
507# resolution.
508for i in [1, 2, 16, 24]:
509    lo_mask = 0xffffffff >> i
510    hi_mask = (0xffffffff << i) & 0xffffffff
511
512    optimizations.extend([
513        # This pattern seems to only help in the soft-fp64 code.
514        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
515#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
516#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
517
518        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
519        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
520#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
521    ])
522
523optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
527   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
528
529   # (a + #b) * #c => (a * #c) + (#b * #c)
530   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),
531
532   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
533   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
534    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
535   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
536    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),
537
538   # Comparison simplifications
539   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
540   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
541   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
542   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
543   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
544   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
545   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
546   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
547   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
548   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),
549
550   (('iand', ('feq', a, b), ('fneu', a, b)), False),
551   (('iand', ('flt', a, b), ('flt', b, a)), False),
552   (('iand', ('ieq', a, b), ('ine', a, b)), False),
553   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
554   (('iand', ('ult', a, b), ('ult', b, a)), False),
555
556   # This helps some shaders because, after some optimizations, they end up
557   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
558   # matching would be handled by CSE.
559   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
560   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
561   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
562   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
563   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
564   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
565   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
566   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
567   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
568   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
569
570   (('ieq', ('ineg', a), 0),  ('ieq', a, 0)),
571   (('ine', ('ineg', a), 0),  ('ine', a, 0)),
572   (('ieq', ('iabs', a), 0),  ('ieq', a, 0)),
573   (('ine', ('iabs', a), 0),  ('ine', a, 0)),
574
575   # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false.
576   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
577
578   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
579   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
580
581   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
582   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
583
584   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
585   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
586
587   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
588   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
589
590   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
591   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
592
593   # 0.0 >= b2f(a)
594   # b2f(a) <= 0.0
595   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
596   # inot(a)
597   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
598
599   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),
600
601   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
602   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
603   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
604   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
605   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
606   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
607   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
608   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
609   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
610   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
611   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
612   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
613   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
614   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
615   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
616   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),
617
618   # -(b2f(a) + b2f(b)) < 0
619   # 0 < b2f(a) + b2f(b)
620   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
621   # a || b
622   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
623   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),
624
625   # -(b2f(a) + b2f(b)) >= 0
626   # 0 >= b2f(a) + b2f(b)
627   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
628   # !(a || b)
629   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
630   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
631
632   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
633   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
634
   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
639   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
640    ('ior', ('flt', c, 0.0), ('ior', a, b))),
641
642   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
643   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
644   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
645   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
646   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
647   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
648   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
649   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
650   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
651   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
652   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
653   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),
654
655   # Cannot remove the addition from ilt or ige due to overflow.
656   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
657   (('ine', ('iadd', a, b), a), ('ine', b, 0)),
658
659   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
660   (('fneu', ('b2f', 'a@1'), 0.0), a),
661   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
662   (('ine', ('b2i', 'a@1'), 0),   a),
663   (('ieq', 'a@1', False), ('inot', a)),
664   (('ieq', 'a@1', True), a),
665   (('ine', 'a@1', False), a),
666   (('ine', 'a@1', True), ('inot', a)),
667
668   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
669   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
670   (('fge', ('u2f', a), 0.0), True),
671   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
672   (('flt', ('u2f', a), 0.0), False),
673   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
674   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
675   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
676   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
677   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
678   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
679   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),
680
681   # 0.0 < fabs(a)
682   # fabs(a) > 0.0
683   # fabs(a) != 0.0 because fabs(a) must be >= 0
684   # a != 0.0
685   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),
686
687   # -fabs(a) < 0.0
688   # fabs(a) > 0.0
689   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),
690
691   # 0.0 >= fabs(a)
692   # 0.0 == fabs(a)   because fabs(a) must be >= 0
693   # 0.0 == a
694   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),
695
696   # -fabs(a) >= 0.0
697   # 0.0 >= fabs(a)
698   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
699
700   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
701   #
702   # This should be NaN safe.
703   #
704   # NaN >= 0 && 1 >= NaN -> false && false -> false
705   #
706   # vs.
707   #
708   # NaN == fsat(NaN) -> NaN == 0 -> false
709   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),
710
711   # Note: fmin(-a, -b) == -fmax(a, b)
712   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
713   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
714   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
715   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
716
717   # fmin(b2f(a), b)
718   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
719   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
720   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
721   #
722   # Since b is a constant, constant folding will eliminate the fmin and the
723   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
724   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),
725
726   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),
727
728   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
729   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
730   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
731   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
732   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
733   (('bcsel', ('ult', b, a), b, a), ('umin', a, b)),
734   (('bcsel', ('ult', a, b), b, a), ('umax', a, b)),
735   (('bcsel', ('uge', a, b), b, a), ('umin', a, b)),
736   (('bcsel', ('uge', b, a), b, a), ('umax', a, b)),
737   (('bcsel', ('ilt', b, a), b, a), ('imin', a, b)),
738   (('bcsel', ('ilt', a, b), b, a), ('imax', a, b)),
739   (('bcsel', ('ige', a, b), b, a), ('imin', a, b)),
740   (('bcsel', ('ige', b, a), b, a), ('imax', a, b)),
741   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
742   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
743   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
744   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
745   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
746   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
747   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
748   (('bcsel', a, True, b), ('ior', a, b)),
749   (('bcsel', a, a, b), ('ior', a, b)),
750   (('bcsel', a, b, False), ('iand', a, b)),
751   (('bcsel', a, b, a), ('iand', a, b)),
752   (('~fmin', a, a), a),
753   (('~fmax', a, a), a),
754   (('imin', a, a), a),
755   (('imax', a, a), a),
756   (('umin', a, a), a),
757   (('umin', a, 0), 0),
758   (('umin', a, -1), a),
759   (('umax', a, a), a),
760   (('umax', a, 0), a),
761   (('umax', a, -1), -1),
762   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
763   (('umax', ('umax', a, b), b), ('umax', a, b)),
764   (('imax', ('imax', a, b), b), ('imax', a, b)),
765   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
766   (('umin', ('umin', a, b), b), ('umin', a, b)),
767   (('imin', ('imin', a, b), b), ('imin', a, b)),
768   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
769   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
770   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
771   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
772   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
773   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
774])
775
776for N in [8, 16, 32, 64]:
777    b2iN = 'b2i{0}'.format(N)
778    optimizations.extend([
779        (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
780        (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
781    ])
782
783for N in [16, 32, 64]:
784    b2fN = 'b2f{0}'.format(N)
785    optimizations.extend([
786        (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
787        (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
788    ])
789
790# Integer sizes
791for s in [8, 16, 32, 64]:
792    optimizations.extend([
793       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),
794
795       # Simplify logic to detect sign of an integer.
796       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
797       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
798       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
799       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
800       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
801       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
802       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
803       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
804       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
805       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
806       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
807       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
808    ])
809
810optimizations.extend([
811   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
812   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
813   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
814   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
815   (('~fmin', a, ('fabs', a)), a),
816   (('imin', a, ('iabs', a)), a),
817   (('~fmax', a, ('fneg', ('fabs', a))), a),
818   (('imax', a, ('ineg', ('iabs', a))), a),
819   (('fmax', a, ('fabs', a)), ('fabs', a)),
820   (('imax', a, ('iabs', a)), ('iabs', a)),
821   (('fmax', a, ('fneg', a)), ('fabs', a)),
822   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
823   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
824   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
825   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
826   # fsat(a) returns 0.0.
827   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
828   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
829   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
830   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
831   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
832   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
833   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
834   # representing this in the optimizations other than the usual ~.
835   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
836   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
837   # the new comparison precise to prevent it being changed to 'a != 0'.
838   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
839   (('fsat', ('b2f', a)), ('b2f', a)),
840   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
841   (('fsat', ('fsat', a)), ('fsat', a)),
842   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
843   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
844   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_preserve_32),
845   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
846   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
847   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
848   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
849   # Both the left and right patterns are "b" when isnan(a), so this is exact.
850   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
851   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
852   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
853   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
854   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
855
856   # max(-min(b, a), b) -> max(abs(b), -a)
857   # min(-max(b, a), b) -> min(-abs(b), -a)
858   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
859   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),
860
861   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
862   # fsat(b-a).
863   #
864   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
865   #
866   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
867   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
868    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),
869
870   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
871
   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
877   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
878   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
879   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
880   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
881   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
882   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
883   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
884   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
885   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
886   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
887   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
888   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
889   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
890   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
891   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
892   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),
893
894   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
895   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
896   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
897   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
898   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
899   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
900   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
901   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
902   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
903   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
904   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
905   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
906   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
907   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
908   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
909   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
910
   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y
   # < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step.  Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
916   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
917   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
918   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
919   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
920
921   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
922   # numbers, then it can be replaced with fneu.
923   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
924
925   # Other patterns may optimize the resulting iand tree further.
926   (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)),
927    ('iand', ('iand', a, b), ('iand', c, b))),
928])
929
930# Float sizes
931for s in [16, 32, 64]:
932    if s == 64:
933        match_fsign_cond = "!options->lower_fsign & !(options->lower_doubles_options & nir_lower_dsign)"
934    else:
935        match_fsign_cond = "!options->lower_fsign"
936    optimizations.extend([
937       # These derive from the previous patterns with the application of b < 0 <=>
938       # 0 < -b.  The transformation should be applied if either comparison is
939       # used once as this ensures that the number of comparisons will not
940       # increase.  The sources to the ior and iand are not symmetric, so the
941       # rules have to be duplicated to get this behavior.
942       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
943       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
944       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
945       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
946       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
947       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
948       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
949       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
950
951       (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
952       (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
953       (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
954       (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
955
956       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
957       # with the bcsel, it's basically copysign(1.0, a).  There are some
958       # behavior differences between this pattern and copysign w.r.t. ±0 and
959       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
960       # to x, regardless of whether either or both values are NaN.
961       #
962       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
963       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
964       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
965       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
966       #
967       # For all other values of 'a', the original and replacement behave as
968       # copysign.
969       #
970       # Marking the replacement comparisons as precise prevents any future
971       # optimizations from replacing either of the comparisons with the
972       # logical-not of the other.
973       #
974       # Note: Use b2i32 in the replacement because some platforms that
975       # support fp16 don't support int16.
976       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
977        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),
978
979       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),
980
981       # The C spec says, "If the value of the integral part cannot be represented
982       # by the integer type, the behavior is undefined."  "Undefined" can mean
983       # "the conversion doesn't happen at all."
984       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),
985
986       # Ironically, mark these as imprecise because removing the conversions may
987       # preserve more precision than doing the conversions (e.g.,
988       # uint(float(0x81818181u)) == 0x81818200).
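       #
       # That example can be reproduced in plain Python (illustrative only):
       #   >>> hex(int(struct.unpack('<f', struct.pack('<f', float(0x81818181)))[0]))
       #   '0x81818200'
       # because a 32-bit float mantissa only holds 24 bits of precision.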
989       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
990       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
991       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
992       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
993
994       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond),
995       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond),
996
997       # float? -> float? -> floatS ==> float? -> floatS
998       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),
999
1000       # int? -> float? -> floatS ==> int? -> floatS
1001       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
1002       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),
1003
1004       # float? -> float? -> intS ==> float? -> intS
1005       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
1006       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
1007
1008       # HLSL's sign function returns an integer
1009       (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)),
1010    ])
1011
1012    for B in [32, 64]:
1013        if s < B:
1014            optimizations.extend([
1015               # S = smaller, B = bigger
1016               # floatS -> floatB -> floatS ==> identity
1017               (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
1018
1019               # floatS -> floatB -> intB ==> floatS -> intB
1020               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
1021               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
1022
1023               # int? -> floatB -> floatS ==> int? -> floatS
1024               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
1025               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),
1026            ])
1027
1028for S in [1, 8, 16, 32]:
1029    for B in [8, 16, 32, 64]:
1030        if B <= S:
1031            continue
1032        optimizations.extend([
1033            # intS -> intB -> intS ==> identity
1034            (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a),
1035            (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a),
1036        ])
1037
1038        if B < 16:
1039            continue
1040        for C in [8, 16, 32, 64]:
1041            if C <= S:
1042                continue
1043            optimizations.extend([
1044                # intS -> intC -> floatB ==> intS -> floatB
1045                (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)),
1046                (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)),
1047            ])
1048
1049# mediump variants of the above
1050optimizations.extend([
1051    # int32 -> float32 -> float16 ==> int32 -> float16
1052    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
1053    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),
1054
1055    # float32 -> float16 -> int16 ==> float32 -> int16
1056    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
1057    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),
1058
1059    # float32 -> int32 -> int16 ==> float32 -> int16
1060    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
1061    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),
1062
1063    # int32 -> int16 -> float16 ==> int32 -> float16
1064    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
1065    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
1066])
1067
1068# Clean up junk left from 8-bit integer to 16-bit integer lowering.
1069optimizations.extend([
1070    # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
1071    # accomplished by masking the upper 8 bits of the immediate operand of the
1072    # iand instruction.  Oftentimes, both patterns will end up being applied
1073    # to the same original expression tree.
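    #
    # For example (illustrative): with a = 0xABCD and b = 0x1234, both sides
    # evaluate to 0x0004, since u2u16(u2u8(a)) == 0x00CD and
    # 0x00CD & 0x1234 == 0xABCD & (0x1234 & 0xff) == 0x0004.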
1074    (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),               ('iand', a, ('iand', b, 0xff))),
1075    (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
1076])
1077
1078for op in ['iand', 'ior', 'ixor']:
1079    optimizations.extend([
1080        (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
1081        (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),
1082
1083        # Undistribute extract from a logic op
1084        ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
1085        ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
1086        ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
1087        ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),
1088
1089        # Undistribute shifts from a logic op
1090        ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
1091        ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
1092        ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
1093    ])
1094
1095# Integer sizes
1096for s in [8, 16, 32, 64]:
1097    last_shift_bit = int(math.log2(s)) - 1
1098
1099    optimizations.extend([
1100       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
1101       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
1102       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
1103       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
1104       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
1105       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),
1106
1107       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
1108       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),
1109
1110       # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)
1111       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
1112       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
1113       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
1114       (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))),
1115    ])
1116
1117optimizations.extend([
1118   # Common pattern like 'if (i == 0 || i == 1 || ...)'
1119   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
1120   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
1121   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
1122   (('ior', a, ('ieq', a, False)), True),
1123
1124   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
1125   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
1126
1127   (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'),
1128    ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)),
1129              ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))),
1130              ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b)))
1131    )
1132   ),
1133
1134   (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)),
1135    ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'),
1136
1137   (('ior',  ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)),
1138   (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)),
1139
1140   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1141   # The first part of the iand comes from the !__feq64_nonnan.
1142   #
1143   # The second pattern is a reformulation of the first based on the relation
1144   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
1145   # happens to be y == 0.
1146   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1147    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1148   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1149    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1150
1151   # These patterns can result when (a < b || a < c) => (a < min(b, c))
1152   # transformations occur before constant propagation and loop-unrolling.
1153   #
1154   # The flt versions are exact.  If isnan(a), the original pattern is
1155   # trivially false, and the replacements are false too.  If isnan(b):
1156   #
1157   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
1158   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1159   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1160   (('~fge', a, ('fmin', b, a)), True),
1161   (('~fge', ('fmax', a, b), a), True),
1162   (('flt', a, ('fmin', b, a)), False),
1163   (('flt', ('fmax', a, b), a), False),
1164   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1165   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1166
1167   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1168   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1169   (('ige', a, ('imin', b, a)), True),
1170   (('ige', ('imax', a, b), a), True),
1171   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1172   (('ult', ('umin', a, b), a), ('ult', b, a)),
1173   (('uge', a, ('umin', b, a)), True),
1174   (('uge', ('umax', a, b), a), True),
1175   (('ilt', a, ('imin', b, a)), False),
1176   (('ilt', ('imax', a, b), a), False),
1177   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1178   (('ige', ('imin', a, b), a), ('ige', b, a)),
1179   (('ult', a, ('umin', b, a)), False),
1180   (('ult', ('umax', a, b), a), False),
1181   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1182   (('uge', ('umin', a, b), a), ('uge', b, a)),
1183   (('ult', a, ('iand', b, a)), False),
1184   (('ult', ('ior', a, b), a), False),
1185   (('uge', a, ('iand', b, a)), True),
1186   (('uge', ('ior', a, b), a), True),
1187
1188   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1189   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1190   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1191   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1192   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1193   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1194   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1195   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1196   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1197   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1198   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1199   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1200   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1201   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1202   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1203   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1204
1205   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1206   # negative.
1207   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1208    ('iabs', ('ishr', a, b))),
1209   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
1210
1211   (('fabs', ('slt', a, b)), ('slt', a, b)),
1212   (('fabs', ('sge', a, b)), ('sge', a, b)),
1213   (('fabs', ('seq', a, b)), ('seq', a, b)),
1214   (('fabs', ('sne', a, b)), ('sne', a, b)),
1215   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1216   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1217   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1218   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1219   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1220   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1221   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1222   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1223   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1224   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1225   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1226   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1227   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1228   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1229   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1230   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1231   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1232   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1233   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1234   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1235   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1236   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1237   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1238   (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'),
1239   (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'),
1240   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1241   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1242   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1243   (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1244   (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1245])
1246
1247def vector_cmp(reduce_op, cmp_op, comps):
1248   if len(comps) == 1:
1249      return (cmp_op, 'a.' + comps[0], 'b.' + comps[0])
1250   else:
1251      mid = len(comps) // 2
1252      return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]),
1253                         vector_cmp(reduce_op, cmp_op, comps[mid:]))
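# For example (illustrative), vector_cmp('iand', 'ieq', 'xyzw') expands to the
# balanced tree
#   ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')),
#            ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w')))
# rather than a linear chain of iands.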
1254
1255for op in [
1256   ('ball_iequal', 'ieq', 'iand'),
1257   ('ball_fequal', 'feq', 'iand'),
1258   ('bany_inequal', 'ine', 'ior'),
1259   ('bany_fnequal', 'fneu', 'ior'),
1260]:
1261   optimizations.extend([
1262      ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'),
1263      ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'),
1264      ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'),
1265      ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'),
1266      ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'),
1267   ])
1268
1269optimizations.extend([
1270   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
1271   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
1272   (('feq', ('slt', a, b), 1.0), ('flt', a, b)),
1273   (('feq', ('sge', a, b), 1.0), ('fge', a, b)),
1274   (('fneu', ('seq', a, b), 0.0), ('feq', a, b)),
1275   (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),
1276   (('fneu', ('slt', a, b), 0.0), ('flt', a, b)),
1277   (('fneu', ('sge', a, b), 0.0), ('fge', a, b)),
1278   (('feq', ('seq', a, b), 0.0), ('fneu', a, b)),
1279   (('feq', ('sne', a, b), 0.0), ('feq', a, b)),
1280   (('feq', ('slt', a, b), 0.0), ('fge', a, b)),
1281   (('feq', ('sge', a, b), 0.0), ('flt', a, b)),
1282   (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),
1283   (('fneu', ('sne', a, b), 1.0), ('feq', a, b)),
1284   (('fneu', ('slt', a, b), 1.0), ('fge', a, b)),
1285   (('fneu', ('sge', a, b), 1.0), ('flt', a, b)),
1286
1287   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1288   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1289   # Emulating booleans
1290   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1291   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1292   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1293   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1294   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1295   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1296   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1297   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
1298   # Comparison with the same args.  Note that these are only done for the
1299   # float versions when the source must be a number.  Generally, NaN cmp NaN
1300   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN
1301   # is false, and, for any number X, X < X is also false.
1302   (('ilt', a, a), False),
1303   (('ige', a, a), True),
1304   (('ieq', a, a), True),
1305   (('ine', a, a), False),
1306   (('ult', a, a), False),
1307   (('uge', a, a), True),
1308   (('flt', a, a), False),
1309   (('fge', 'a(is_a_number)', a), True),
1310   (('feq', 'a(is_a_number)', a), True),
1311   (('fneu', 'a(is_a_number)', a), False),
1312   # Logical and bit operations
1313   (('iand', a, a), a),
1314   (('iand', a, 0), 0),
1315   (('iand', a, -1), a),
1316   (('iand', a, ('inot', a)), 0),
1317   (('ior', a, a), a),
1318   (('ior', a, 0), a),
1319   (('ior', a, -1), -1),
1320   (('ior', a, ('inot', a)), -1),
1321   (('ixor', a, a), 0),
1322   (('ixor', a, 0), a),
1323   (('ixor', a, ('ixor', a, b)), b),
1324   (('ixor', a, -1), ('inot', a)),
1325   (('inot', ('inot', a)), a),
1326   (('ior', ('iand', a, b), b), b),
1327   (('ior', ('ior', a, b), b), ('ior', a, b)),
1328   (('iand', ('ior', a, b), b), b),
1329   (('iand', ('iand', a, b), b), ('iand', a, b)),
1330
1331   # It is common for sequences of (x & 1) to occur in large trees.  Replacing
1332   # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "&
1333   # 1" to eventually bubble up to the top of the tree.
1334   (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)),
1335    ('iand', a, ('iand', b, c))),
1336
1337   (('iand@64', a, '#b(is_lower_half_zero)'),
1338    ('pack_64_2x32_split', 0,
1339                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1340     '!options->lower_pack_64_2x32_split'),
1341   (('iand@64', a, '#b(is_upper_half_zero)'),
1342    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1343                           0),
1344     '!options->lower_pack_64_2x32_split'),
1345   (('iand@64', a, '#b(is_lower_half_negative_one)'),
1346    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1347                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1348     '!options->lower_pack_64_2x32_split'),
1349   (('iand@64', a, '#b(is_upper_half_negative_one)'),
1350    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1351                           ('unpack_64_2x32_split_y', a)),
1352     '!options->lower_pack_64_2x32_split'),
1353
1354   (('ior@64', a, '#b(is_lower_half_zero)'),
1355    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1356                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1357     '!options->lower_pack_64_2x32_split'),
1358   (('ior@64', a, '#b(is_upper_half_zero)'),
1359    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1360                           ('unpack_64_2x32_split_y', a)),
1361     '!options->lower_pack_64_2x32_split'),
1362   (('ior@64', a, '#b(is_lower_half_negative_one)'),
1363    ('pack_64_2x32_split', -1,
1364                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1365     '!options->lower_pack_64_2x32_split'),
1366   (('ior@64', a, '#b(is_upper_half_negative_one)'),
1367    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1368                           -1),
1369     '!options->lower_pack_64_2x32_split'),
1370
1371   (('ixor@64', a, '#b(is_lower_half_zero)'),
1372    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1373                           ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1374     '!options->lower_pack_64_2x32_split'),
1375   (('ixor@64', a, '#b(is_upper_half_zero)'),
1376    ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1377                           ('unpack_64_2x32_split_y', a)),
1378     '!options->lower_pack_64_2x32_split'),
1379
1380   # DeMorgan's Laws
1381   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1382   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1383   # Shift optimizations
1384   (('ishl', 0, a), 0),
1385   (('ishl', a, 0), a),
1386   (('ishr', 0, a), 0),
1387   (('ishr', -1, a), -1),
1388   (('ishr', a, 0), a),
1389   (('ushr', 0, a), 0),
1390   (('ushr', a, 0), a),
1391   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'),
1392   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'),
1393   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'),
1394   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'),
1395   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'),
1396   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'),
1397   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'),
1398   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'),
1399   (('urol@8',  a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub',  8, b))), '!options->has_rotate8'),
1400   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'),
1401   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'),
1402   (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))),
1403   (('uror@8',  a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub',  8, b))), '!options->has_rotate8'),
1404   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'),
1405   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'),
1406   (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))),
1407
1408   # bfi(X, a, b) = (b & ~X) | (a & X)
1409   # If X = ~0: (b & 0) | (a & 0xffffffff) = a
1410   # If X = 0:  (b & 0xffffffff) | (a & 0) = b
1411   (('bfi', 0xffffffff, a, b), a),
1412   (('bfi', 0x00000000, a, b), b),
1413
1414   # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the
1415   # bfi is either b or c.
1416   (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)),
1417
1418   # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a)
1419   #              = (a & a) | (b & ~a)    If a is odd, find_lsb(a) == 0
1420   #              = a | (b & ~a)
1421   #              = a | b
1422   (('bfi', '#a(is_odd)', a, b), ('ior', a, b)),
1423
1424   # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a)
1425   #              = ((b << find_lsb(a)) & a)
1426   #              = (b & a)               If a is odd, find_lsb(a) == 0
1427   (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)),
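   # As a rough reference model (illustrative only):
   #   bfi(mask, insert, base) == ((insert << find_lsb(mask)) & mask) | (base & ~mask)
   # so with an odd mask such as 0x0f, find_lsb(mask) == 0 and, e.g.,
   # bfi(0x0f, 0x36, 0) == 0x36 & 0x0f == 0x06, matching the iand replacement.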
1428
1429   # Because 'a' is a positive power of two, the result of the bfi is either 0
1430   # or 'a' depending on whether or not 'b' is odd.  Use 'b&1' for the zero
1431   # value to help platforms that can't have two constants in a bcsel.
1432   (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1433    ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))),
1434   (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1435    ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))),
1436
1437   # Exponential/logarithmic identities
1438   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1439   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1440   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1441   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1442   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
1443    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
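   # e.g. (illustrative) a = 3, b = 4, c = 2, d = 5:
   # 2^(lg2(3)*4 + lg2(2)*5) = 2^6.3399... * 2^5 = 81 * 32 = 3^4 * 2^5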
1444   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1445   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1446   (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)),
1447   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1448   (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)),
1449   (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))),
1450   (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))),
1451   (('~fpow', a, 1.0), a),
1452   (('~fpow', a, 2.0), ('fmul', a, a)),
1453   (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)),
1454   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1455   (('~fpow', 2.0, a), ('fexp2', a)),
1456   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1457   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1458   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1459   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1460   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1461   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1462   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1463   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1464   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1465   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1466   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1467   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1468   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
1469   # Division and reciprocal
1470   (('~fdiv', 1.0, a), ('frcp', a)),
1471   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1472   (('~frcp', ('frcp', a)), a),
1473   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1474   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1475   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1476   # Trig
1477   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1478   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1479   # Boolean simplifications
1480   (('ieq', a, True), a),
1481   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1482   (('ine', a, False), a),
1483   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1484   (('bcsel', a, True, False), a),
1485   (('bcsel', a, False, True), ('inot', a)),
1486   (('bcsel', True, b, c), b),
1487   (('bcsel', False, b, c), c),
1488
1489   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1490   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1491   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1492   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1493   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1494   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1495   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1496   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1497   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1498   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1499   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1500   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1501
1502   (('bcsel', a, b, b), b),
1503   (('~fcsel', a, b, b), b),
1504
1505   # D3D Boolean emulation
1506   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
1507   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
1508   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
1509   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
1510   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1511    ('ineg', ('b2i', ('iand', a, b)))),
1512   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1513    ('ineg', ('b2i', ('ior', a, b)))),
1514   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1515   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1516   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1517   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1518   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1519   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1520   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1521
1522   # With D3D booleans, imax is AND and umax is OR
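   # For example (illustrative): with a = True (-1) and b = False (0),
   # imax(-1, 0) == 0 == -int(a && b) and imin(-1, 0) == -1 == -int(a || b);
   # treated as unsigned, -1 is the largest value, so umax gives OR and umin
   # gives AND.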
1523   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1524    ('ineg', ('b2i', ('iand', a, b)))),
1525   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1526    ('ineg', ('b2i', ('ior', a, b)))),
1527   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1528    ('ineg', ('b2i', ('ior', a, b)))),
1529   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1530    ('ineg', ('b2i', ('iand', a, b)))),
1531   (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior',  a, b))),
1532   (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1533
1534   # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op.
1535   (('iand', ('b2i', a), 1), ('b2i', a)),
1536
1537   (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))),
1538   (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))),
1539
1540   # Conversions
1541   (('f2i', ('ftrunc', a)), ('f2i', a)),
1542   (('f2u', ('ftrunc', a)), ('f2u', a)),
1543
1544   # Conversions from 16 bits to 32 bits and back can always be removed
1545   (('f2fmp', ('f2f32', 'a@16')), a),
1546   (('i2imp', ('i2i32', 'a@16')), a),
1547   (('i2imp', ('u2u32', 'a@16')), a),
1548
1549   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1550   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1551   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1552   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1553
1554   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1555   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1557
1558   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1559   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1560   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1561   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1562
1563   # Conversions to 16 bits would be lossy so they should only be removed if
1564   # the instruction was generated by the precision lowering pass.
1565   (('f2f32', ('f2fmp', 'a@32')), a),
1566   (('i2i32', ('i2imp', 'a@32')), a),
1567   (('u2u32', ('i2imp', 'a@32')), a),
1568
1569   # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32
1570   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1571   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1572   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1573   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1574
1575   # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32
1576   (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)),
1577   (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)),
1578   (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)),
1579
1580   (('ffloor', 'a(is_integral)'), a),
1581   (('fceil', 'a(is_integral)'), a),
1582   (('ftrunc', 'a(is_integral)'), a),
1583   (('fround_even', 'a(is_integral)'), a),
1584
1585   # fract(x) = x - floor(x), so fract(NaN) = NaN
1586   (('~ffract', 'a(is_integral)'), 0.0),
1587   (('fabs', 'a(is_not_negative)'), a),
1588   (('iabs', 'a(is_not_negative)'), a),
1589   (('fsat', 'a(is_not_positive)'), 0.0),
1590
1591   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1592
1593   # The result of the multiply must be in [-1, 0], so the result of the ffma
1594   # must be in [0, 1].
1595   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1596   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1597   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1598   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1599
1600   (('fneu', 'a(is_not_zero)', 0.0), True),
1601   (('feq', 'a(is_not_zero)', 0.0), False),
1602
1603   # In this chart, + means value > 0 and - means value < 0.
1604   #
1605   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1606   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1607   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1608   #
1609   # Using grouping conceptually similar to a Karnaugh map...
1610   #
1611   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1612   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1613   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1614   #
1615   # The flt / ilt cases just invert the expected result.
1616   #
1617   # The results expecting true must be marked imprecise.  The results
1618   # expecting false are fine because NaN compared >= or < anything is false.
1619
1620   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1621   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1622   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1623
1624   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1625   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1626   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1627
1628   (('ine', 'a(is_not_zero)', 0), True),
1629   (('ieq', 'a(is_not_zero)', 0), False),
1630
1631   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1632   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1633   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1634
1635   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1636   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1637   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1638
1639   (('ult', 0, 'a(is_gt_zero)'), True),
1640   (('ult', a, 0), False),
1641
1642   # Packing and then unpacking does nothing
1643   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1644   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1645   (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'),
1646   (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'),
1647   (('unpack_64_2x32_split_x', ('u2u64', 'a@32')), a),
1648   (('unpack_64_2x32_split_y', ('u2u64', a)), 0),
1649   (('unpack_64_2x32_split_x', ('i2i64', 'a@32')), a),
1650   (('unpack_64_2x32_split_y', ('i2i64(is_used_once)', 'a@32')), ('ishr', a, 31)),
1651   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1652   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1653   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1654   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1655                           ('unpack_64_2x32_split_y', a)), a),
1656   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1657                              ('unpack_64_2x32_split_y', a))), a),
1658   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1659   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1660
1661   (('unpack_64_4x16', ('pack_64_4x16', a)), a),
1662   (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)),
1663   (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)),
1664
1665   # Comparing two halves of an unpack separately.  While this optimization
1666   # should be correct for non-constant values, it's less obvious that it's
1667   # useful in that case.  For constant values, the pack will fold and we're
1668   # guaranteed to reduce the whole tree to one instruction.
1669   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1670             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1671    ('ieq', a, ('pack_32_2x16_split', b, c))),
1672
1673   # Byte extraction
1674   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1675   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1676   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1677   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1678   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1679   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1680   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1681
1682   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1683   # storage buffer.
1684   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1685   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1686
1687   # Common pattern after lowering 8-bit integers to 16-bit.
1688   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1689   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1690
1691   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1692   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1693   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1694   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1695   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1696   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1697   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1698   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1699
1700   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1701   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1702
1703   # Word extraction
1704   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1705   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1706   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1707   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1708   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1709
1710   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1711   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1712   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1713   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1714
1715   # Packing a u8vec4 to write to an SSBO.
1716   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1717    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
1718
1719   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1720   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1721
1722   # Lower pack/unpack
1723   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1724   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'),
1725   (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'),
1726   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1727   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1728   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1729   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1730
1731   (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1732   (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1733
1734   # Useless masking before unpacking
1735   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1736   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1737   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1738   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1739   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1740   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1741
1742   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1743   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1744   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1745   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1746   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1747
1748   # Optimize half packing
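   # These hold because pack_half_2x16(vec2(x, 0.0)) stores fp16(x) in the low
   # 16 bits and 0x0000 in the high 16 bits, so the 16-bit shift just moves
   # the meaningful half to the other position.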
1749   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1750   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1751
1752   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1753    ('pack_half_2x16', ('vec2', a, b))),
1754   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1755    ('pack_half_2x16', ('vec2', a, b))),
1756
1757   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1758   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1759   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1760
1761   (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)),
1762   (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)),
1763   (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)),
1764
1765   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1766   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1767
1768   (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
1769   (('ior',  ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
1770
1771   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
1772   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
1773   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
1774   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
1775   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
1776   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
1777   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
1778   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
1779
1780   # Reduce intermediate precision with int64.
1781   (('u2u32', ('iadd(is_used_once)', 'a@64', b)),
1782    ('iadd', ('u2u32', a), ('u2u32', b))),
1783])
1784
1785# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
1786# patterns like those below.
1787for op in ('ushr', 'ishr'):
1788   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
1789   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
1790   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
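# For instance (illustrative), the i = 3 case of the 32-bit comprehension above
# emits (('extract_u8', ('ushr', 'a@32', 24), 0), ('extract_u8', a, 3)).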
1791
1792optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
1793
1794# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
1795# patterns like those below.
1796for op in ('extract_u8', 'extract_i8'):
1797   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
1798   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
1799   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
1800
1801optimizations.extend([
1802   # Subtracts
1803   (('ussub_4x8_vc4', a, 0), a),
1804   (('ussub_4x8_vc4', a, ~0), 0),
1805   # Lower all Subtractions first - they can get recombined later
1806   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1807   (('isub', a, b), ('iadd', a, ('ineg', b))),
1808   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1809   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1810   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1811   (('bitz', a, b), ('inot', ('bitnz', a, b))),
1812
1813   # Propagate negation up multiplication chains
1814   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1815   (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_preserve_32),
1816   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
1817   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
1818   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1819
1820   # Propagate constants up multiplication chains
1821   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1822   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
1823   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
1824   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1825   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
1826   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
1827   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
1828   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1829   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1830   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1831   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
1832   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
1833   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1834
1835   # Reassociate constants in add/mul chains so they can be folded together.
1836   # For now, we mostly only handle cases where the constants are separated by
1837   # a single non-constant.  We could do better eventually.
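   # e.g. (illustrative) 2.0 * (x * 3.0) becomes (2.0 * 3.0) * x, which
   # constant folding then reduces to 6.0 * x.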
1838   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1839   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
1840   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
1841   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
1842   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
1843   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
1844   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1845   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1846   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
1847   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
1848   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1849   (('~fadd', '#a',          ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffmaz',          b,  c, ('fadd', a,          d))),
1850   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1851   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
1852   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
1853   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
1854   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
1855
1856   # Reassociate add chains for more MAD/FMA-friendly code
1857   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
1858
1859   # Drop mul-div by the same value when there's no wrapping.
1860   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
1861
1862   # By definition...
1863   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
1864   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1865   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1866   (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
1867   (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
1868
1869   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
1870   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1871   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1872   (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
1873   (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
1874
1875   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
1876   (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
1877
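   # The *_rev opcodes count from the MSB end, so whenever find_msb returns
   # r != -1 on a 32-bit source the corresponding *_rev opcode returns 31 - r
   # (and both return -1 otherwise); each bcsel below just spells that
   # relation out.  Illustrative: ufind_msb(0x00f0) == 7, ufind_msb_rev(0x00f0) == 24.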
1878   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
1879   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1880   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
1881   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1882   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
1883   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1884   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
1885   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1886   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1887   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1888   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1889   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1890
1891   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'),
1892   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
1893   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
1894   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
1895   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'),
1896   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
1897   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
1898   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
1899   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
1900   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
1901   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
1902   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
1903
1904   # This is safe. Both ufind_msb_rev and bitfield_reverse can only have
1905   # 32-bit sources, so the transformation can only generate correct NIR.
1906   (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
1907   (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'),
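   # Rough intuition for the pair above: bitfield_reverse maps bit i to bit
   # 31-i, so the lowest set bit of the reversed value is 31 minus the highest
   # set bit of the original, which is exactly what the *_rev opcode reports;
   # the symmetric argument gives the second rule.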
1908
1909   (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))),
1910
1911   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1912   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1913   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1914   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1915   (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)),
1916   (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)),
1917   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
1918
1919   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
1920   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
1921
1922   # Misc. lowering
1923   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
1924   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
1925   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
1926   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
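   # Quick sanity check of the fmod/frem lowerings above (illustrative only):
   # fmod(-3.5, 2.0) -> -3.5 - 2.0*floor(-1.75) = -3.5 + 4.0 =  0.5  (sign of b)
   # frem(-3.5, 2.0) -> -3.5 - 2.0*trunc(-1.75) = -3.5 + 2.0 = -1.5  (sign of a)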
1927
1928   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1929    ('bcsel', ('ult', 31, 'bits'), 'insert',
1930              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
1931    'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'),
1932   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1933   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1934   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1935   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1936   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1937   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1938   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1939   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
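   # The halving adds above rely on the carry-free identity
   # a + b == 2*(a & b) + (a ^ b), so the rounded-down average is
   # (a & b) + ((a ^ b) >> 1) and the rounded-up average is
   # (a | b) - ((a ^ b) >> 1), neither of which can overflow the source width.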
1940
1941   (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'),
1942   (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'),
1943
1944   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1945   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
1946   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'),
1947   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),
1948
1949   # int64_t sum = a + b;
1950   #
   # if (a < 0 && b < 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b >= 0 && sum < a) {
   #    sum = INT64_MAX;
   # }
1956   #
1957   # A couple optimizations are applied.
1958   #
1959   # 1. a < sum => sum >= 0.  This replacement works because it is known that
1960   #    a < 0 and b < 0, so sum should also be < 0 unless there was
1961   #    underflow.
1962   #
1963   # 2. sum < a => sum < 0.  This replacement works because it is known that
1964   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
1965   #    overflow.
1966   #
1967   # 3. Invert the second if-condition and swap the order of parameters for
1968   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
1969   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
1970   #
1971   # On Intel Gen11, this saves ~11 instructions.
1972   (('iadd_sat@64', a, b), ('bcsel',
1973                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1974                            0x8000000000000000,
1975                            ('bcsel',
1976                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1977                             ('iadd', a, b),
1978                             0x7fffffffffffffff)),
1979    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
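   # A quick spot check of the pattern above (not exhaustive): with
   # a == b == INT64_MAX the wrapped sum is negative, so the outer condition
   # (a < 0 && b < 0 && sum >= 0) is false and the inner condition
   # (a < 0 || b < 0 || sum >= 0) is also false, which selects INT64_MAX as
   # required.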
1980
1981   # int64_t sum = a - b;
1982   #
   # if (a < 0 && b >= 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b < 0 && a >= sum) {
   #    sum = INT64_MAX;
   # }
1988   #
1989   # Optimizations similar to the iadd_sat case are applied here.
1990   (('isub_sat@64', a, b), ('bcsel',
1991                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1992                            0x8000000000000000,
1993                            ('bcsel',
1994                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1995                             ('isub', a, b),
1996                             0x7fffffffffffffff)),
1997    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
1998
1999   # These are done here instead of in the backend because the int64 lowering
2000   # pass will make a mess of the patterns.  The first patterns are
2001   # conditioned on nir_lower_minmax64 because it was not clear that it was
2002   # always an improvement on platforms that have real int64 support.  No
2003   # shaders in shader-db hit this, so it was hard to say one way or the
2004   # other.
2005   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2006   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2007   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2008   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2009   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2010   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2011
2012   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2013   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2014   # 0u < uint(a) <=> uint(a) != 0u
2015   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2016
2017   # Alternative lowering that doesn't rely on bfi.
2018   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2019    ('bcsel', ('ult', 31, 'bits'),
2020     'insert',
2021    (('ior',
2022     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
2023     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
2024    'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'),
2025
2026   # Alternative lowering that uses bitfield_select.
2027   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2028    ('bcsel', ('ult', 31, 'bits'), 'insert',
2029              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
2030    'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'),
2031
2032   (('ibitfield_extract', 'value', 'offset', 'bits'),
2033    ('bcsel', ('ult', 31, 'bits'), 'value',
2034              ('ibfe', 'value', 'offset', 'bits')),
2035    'options->lower_bitfield_extract && options->has_bfe'),
2036
2037   (('ubitfield_extract', 'value', 'offset', 'bits'),
2038    ('bcsel', ('ult', 31, 'bits'), 'value',
2039              ('ubfe', 'value', 'offset', 'bits')),
2040    'options->lower_bitfield_extract && options->has_bfe'),
2041
2042   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
2043   (('bitfield_select', a, b, 0), ('iand', a, b)),
2044   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
2045
2046   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
2047   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
2048   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
2049   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
2050   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
2051   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
2052   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
2053
2054   # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such
2055   (('ult', a, ('umin', ('iand', a, b), c)), False),
2056   (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False),
2057   (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2058    ('ubfe', 'value', 'offset', 'width')),
2059   (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2060    ('ibfe', 'value', 'offset', 'width')),
2061   (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'),
2062    ('bfm', 'width', 'offset')),
2063
2064   # open-coded BFM
2065   (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'),
2066   (('ishl', ('bfm', a, 0), b), ('bfm', a, b)),
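   # For example, with a == 5 the first rule's left-hand side is
   # (1 << 5) - 1 == 0x1f, which is bfm(5, 0); shifting that mask left by b
   # then reproduces bfm(5, b).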
2067
2068   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
2069   #
2070   #    If bits is zero, the result will be zero.
2071   #
2072   # These patterns prevent other patterns from generating invalid results
2073   # when count is zero.
2074   (('ubfe', a, b, 0), 0),
2075   (('ibfe', a, b, 0), 0),
2076
2077   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
2078
2079   (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)),
2080   (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
2081   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2082   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2083   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2084   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2085
2086   (('ibitfield_extract', 'value', 'offset', 'bits'),
2087    ('bcsel', ('ieq', 0, 'bits'),
2088     0,
2089     ('ishr',
2090       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
2091       ('isub', 32, 'bits'))),
2092    'options->lower_bitfield_extract && !options->has_bfe'),
2093
2094   (('ubitfield_extract', 'value', 'offset', 'bits'),
2095    ('iand',
2096     ('ushr', 'value', 'offset'),
2097     ('bcsel', ('ieq', 'bits', 32),
2098      0xffffffff,
2099      ('isub', ('ishl', 1, 'bits'), 1))),
2100    'options->lower_bitfield_extract && !options->has_bfe'),
2101
2102   (('ifind_msb', 'value'),
2103    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
2104    'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'),
2105
2106   (('ifind_msb', 'value'),
2107    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
2108     ('isub', 31, ('ifind_msb_rev', 'value')),
2109     ('ifind_msb_rev', 'value')),
2110    'options->lower_ifind_msb && options->has_find_msb_rev'),
2111
2112   # uclz of an absolute value source almost always does the right thing.
   # There are a few problem values:
2114   #
2115   # * 0x80000000.  Since abs(0x80000000) == 0x80000000, uclz returns 0.
2116   #   However, findMSB(int(0x80000000)) == 30.
2117   #
2118   # * 0xffffffff.  Since abs(0xffffffff) == 1, uclz returns 31.  Section 8.8
2119   #   (Integer Functions) of the GLSL 4.50 spec says:
2120   #
2121   #    For a value of zero or negative one, -1 will be returned.
2122   #
   # * Negative powers of two.  uclz(abs(-(1<<x))) returns 31-x, so 31-uclz
   #   would yield x, but findMSB(-(1<<x)) should return x-1.
2125   #
2126   # For all negative number cases, including 0x80000000 and 0xffffffff, the
2127   # correct value is obtained from uclz if instead of negating the (already
2128   # negative) value the logical-not is used.  A conditional logical-not can
2129   # be achieved by (x ^ (x >> 31)).
2130   (('ifind_msb', 'value'),
2131    ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))),
2132    'options->lower_ifind_msb && options->has_uclz'),
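   # Two spot checks of the expression above (uclz(0) is 32 here, see the uclz
   # rule below): value == -1 gives -1 ^ -1 == 0 and 31 - uclz(0) == -1;
   # value == 0x80000000 gives 0x80000000 ^ 0xffffffff == 0x7fffffff and
   # 31 - uclz(0x7fffffff) == 30, both matching findMSB.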
2133
2134   (('ufind_msb', 'value@32'),
2135    ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
2136     ('isub', 31, ('ufind_msb_rev', 'value')),
2137     ('ufind_msb_rev', 'value')),
2138    'options->lower_ufind_msb && options->has_find_msb_rev'),
2139
2140   (('ufind_msb', 'value@32'),
2141    ('isub', 31, ('uclz', 'value')),
2142    'options->lower_ufind_msb && options->has_uclz'),
2143
2144   (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'),
2145
2146   (('find_lsb', 'value@64'),
2147    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
2148    'options->lower_find_lsb'),
2149
2150   (('find_lsb', 'value'),
2151    ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))),
2152    'options->lower_find_lsb'),
2153
2154   (('extract_i8', a, 'b@32'),
2155    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
2156    'options->lower_extract_byte'),
2157
2158   (('extract_u8', a, 'b@32'),
2159    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
2160    'options->lower_extract_byte'),
2161
2162   (('extract_i16', a, 'b@32'),
2163    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
2164    'options->lower_extract_word'),
2165
2166   (('extract_u16', a, 'b@32'),
2167    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
2168    'options->lower_extract_word'),
2169
2170    (('pack_unorm_2x16', 'v'),
2171     ('pack_uvec2_to_uint',
2172        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
2173     'options->lower_pack_unorm_2x16'),
2174
2175    (('pack_unorm_4x8', 'v'),
2176     ('pack_uvec4_to_uint',
2177        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
2178     'options->lower_pack_unorm_4x8'),
2179
2180    (('pack_snorm_2x16', 'v'),
2181     ('pack_uvec2_to_uint',
2182        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
2183     'options->lower_pack_snorm_2x16'),
2184
2185    (('pack_snorm_4x8', 'v'),
2186     ('pack_uvec4_to_uint',
2187        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
2188     'options->lower_pack_snorm_4x8'),
2189
2190    (('unpack_unorm_2x16', 'v'),
2191     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
2192                                  ('extract_u16', 'v', 1))),
2193              65535.0),
2194     'options->lower_unpack_unorm_2x16'),
2195
2196    (('unpack_unorm_4x8', 'v'),
2197     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
2198                                  ('extract_u8', 'v', 1),
2199                                  ('extract_u8', 'v', 2),
2200                                  ('extract_u8', 'v', 3))),
2201              255.0),
2202     'options->lower_unpack_unorm_4x8'),
2203
2204    (('unpack_snorm_2x16', 'v'),
2205     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
2206                                                            ('extract_i16', 'v', 1))),
2207                                           32767.0))),
2208     'options->lower_unpack_snorm_2x16'),
2209
2210    (('unpack_snorm_4x8', 'v'),
2211     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
2212                                                            ('extract_i8', 'v', 1),
2213                                                            ('extract_i8', 'v', 2),
2214                                                            ('extract_i8', 'v', 3))),
2215                                           127.0))),
2216     'options->lower_unpack_snorm_4x8'),
2217
2218   (('pack_half_2x16_split', 'a@32', 'b@32'),
2219    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
2220    'options->lower_pack_split'),
2221
2222   (('unpack_half_2x16_split_x', 'a@32'),
2223    ('f2f32', ('u2u16', a)),
2224    'options->lower_pack_split'),
2225
2226   (('unpack_half_2x16_split_y', 'a@32'),
2227    ('f2f32', ('u2u16', ('ushr', a, 16))),
2228    'options->lower_pack_split'),
2229
2230   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
2231   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
2232   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
2233   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
2234   # Mark the new comparisons precise to prevent them being changed to 'a !=
2235   # 0' or 'a == 0'.
2236   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
2237   (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'),
2238
2239   # Address/offset calculations:
   # Drivers supporting imul24 should use the nir_lower_amul() pass; this
   # rule converts everyone else to imul:
2242   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
2243
2244   (('umul24', a, b),
2245    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
2246    '!options->has_umul24'),
2247   (('umad24', a, b, c),
2248    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
2249    '!options->has_umad24'),
2250
2251   # Relaxed 24bit ops
2252   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
2253   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
2254   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
2255   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
2256   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
2257   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
2258
2259   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
2260   (('imad24_ir3', a, 0, c), (c)),
2261   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
2262
2263   # if first two srcs are const, crack apart the imad so constant folding
2264   # can clean up the imul:
2265   # TODO ffma should probably get a similar rule:
2266   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
2267
2268   # These will turn 24b address/offset calc back into 32b shifts, but
2269   # it should be safe to get back some of the bits of precision that we
   # already decided were not necessary:
2271   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
2272   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
2273   (('imul24', a, 0), (0)),
2274])
2275
2276for bit_size in [8, 16, 32, 64]:
2277   cond = '!options->lower_uadd_sat'
2278   if bit_size == 64:
2279      cond += ' && !(options->lower_int64_options & nir_lower_iadd64)'
2280   add = 'iadd@' + str(bit_size)
2281
2282   optimizations += [
2283      (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2284      (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond),
2285      (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond),
2286      (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2287   ]
2288
2289for bit_size in [8, 16, 32, 64]:
2290   cond = '!options->lower_usub_sat'
2291   if bit_size == 64:
2292      cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)'
2293   add = 'iadd@' + str(bit_size)
2294
2295   optimizations += [
2296      (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2297      (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2298      (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2299      (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2300   ]
2301
2302# bit_size dependent lowerings
2303for bit_size in [8, 16, 32, 64]:
2304   # convenience constants
2305   intmax = (1 << (bit_size - 1)) - 1
2306   intmin = 1 << (bit_size - 1)
2307
2308   optimizations += [
2309      (('iadd_sat@' + str(bit_size), a, b),
2310       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2311                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2312      (('isub_sat@' + str(bit_size), a, b),
2313       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2314                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2315   ]
2316
2317invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2318
2319for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2320   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
2321                         ('iand', (invert[left], a, b), (invert[right], c, d))))
2322   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
2323                         ('ior', (invert[left], a, b), (invert[right], c, d))))
2324
2325# Optimize x2yN(b2x(x)) -> b2y
2326for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2327   if x != 'f' and y != 'f' and x != y:
2328      continue
2329
2330   b2x = 'b2f' if x == 'f' else 'b2i'
2331   b2y = 'b2f' if y == 'f' else 'b2i'
2332   x2yN = '{}2{}'.format(x, y)
2333   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2334
2335# Optimize away x2xN(a@N)
2336for t in ['int', 'uint', 'float', 'bool']:
2337   for N in type_sizes(t):
2338      x2xN = '{0}2{0}{1}'.format(t[0], N)
2339      aN = 'a@{0}'.format(N)
2340      optimizations.append(((x2xN, aN), a))
2341
2342# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2343# In particular, we can optimize away everything except upcast of downcast and
2344# upcasts where the type differs from the other cast
2345for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2346   if N < M:
2347      # The outer cast is a down-cast.  It doesn't matter what the size of the
      # argument of the inner cast is because we'll never be in the upcast
2349      # of downcast case.  Regardless of types, we'll always end up with y2yN
2350      # in the end.
2351      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
2352         x2xN = '{0}2{0}{1}'.format(x, N)
2353         y2yM = '{0}2{0}{1}'.format(y, M)
2354         y2yN = '{0}2{0}{1}'.format(y, N)
2355         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
2356   elif N > M:
2357      # If the outer cast is an up-cast, we have to be more careful about the
2358      # size of the argument of the inner cast and with types.  In this case,
      # the type is always the type of the up-cast, which is given by the
2360      # outer cast.
2361      for P in type_sizes('uint'):
2362         # We can't optimize away up-cast of down-cast.
2363         if M < P:
2364            continue
2365
         # Because we're doing up-cast of up-cast, the types always have
2367         # to match between the two casts
2368         for x in ['i', 'u']:
2369            x2xN = '{0}2{0}{1}'.format(x, N)
2370            x2xM = '{0}2{0}{1}'.format(x, M)
2371            aP = 'a@{0}'.format(P)
2372            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
2373   else:
2374      # The N == M case is handled by other optimizations
2375      pass
2376
2377# Downcast operations should be able to see through pack
2378for t in ['i', 'u']:
2379    for N in [8, 16, 32]:
2380        x2xN = '{0}2{0}{1}'.format(t, N)
2381        optimizations += [
2382            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
2384        ]
2385
2386# Optimize comparisons with up-casts
2387for t in ['int', 'uint', 'float']:
2388    for N, M in itertools.product(type_sizes(t), repeat=2):
2389        if N == 1 or N >= M:
2390            continue
2391
2392        cond = 'true'
2393        if N == 8:
2394            cond = 'options->support_8bit_alu'
2395        elif N == 16:
2396            cond = 'options->support_16bit_alu'
2397        x2xM = '{0}2{0}{1}'.format(t[0], M)
2398        x2xN = '{0}2{0}{1}'.format(t[0], N)
2399        aN = 'a@' + str(N)
2400        bN = 'b@' + str(N)
2401        xeq = 'feq' if t == 'float' else 'ieq'
2402        xne = 'fneu' if t == 'float' else 'ine'
2403        xge = '{0}ge'.format(t[0])
2404        xlt = '{0}lt'.format(t[0])
2405
2406        # Up-casts are lossless so for correctly signed comparisons of
2407        # up-casted values we can do the comparison at the largest of the two
2408        # original sizes and drop one or both of the casts.  (We have
2409        # optimizations to drop the no-op casts which this may generate.)
2410        for P in type_sizes(t):
2411            if P == 1 or P > N:
2412                continue
2413
2414            bP = 'b@' + str(P)
2415            optimizations += [
2416                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
2417                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
2418                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
2419                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
2420                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
2421                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
2422            ]
2423
2424        # The next bit doesn't work on floats because the range checks would
2425        # get way too complicated.
2426        if t in ['int', 'uint']:
2427            if t == 'int':
2428                xN_min = -(1 << (N - 1))
2429                xN_max = (1 << (N - 1)) - 1
2430            elif t == 'uint':
2431                xN_min = 0
2432                xN_max = (1 << N) - 1
2433            else:
2434                assert False
2435
2436            # If we're up-casting and comparing to a constant, we can unfold
2437            # the comparison into a comparison with the shrunk down constant
2438            # and a check that the constant fits in the smaller bit size.
2439            optimizations += [
2440                ((xeq, (x2xM, aN), '#b'),
2441                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2442                ((xne, (x2xM, aN), '#b'),
2443                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2444                ((xlt, (x2xM, aN), '#b'),
2445                 ('iand', (xlt, xN_min, b),
2446                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2447                ((xlt, '#a', (x2xM, bN)),
2448                 ('iand', (xlt, a, xN_max),
2449                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2450                ((xge, (x2xM, aN), '#b'),
2451                 ('iand', (xge, xN_max, b),
2452                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2453                ((xge, '#a', (x2xM, bN)),
2454                 ('iand', (xge, a, xN_min),
2455                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2456            ]
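            # For instance (uint case, illustrative): ieq(u2u32(a@8), 300)
            # becomes iand(ieq(a, u2u8(300)), ieq(u2u32(u2u8(300)), 300)); the
            # second term folds to false because 300 does not fit in 8 bits, so
            # the whole comparison folds to false, whereas a constant that does
            # fit reduces to a plain 8-bit comparison.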
2457
2458# Convert masking followed by signed downcast to just unsigned downcast
2459optimizations += [
2460    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2461    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2462    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2463    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2464    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2465    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2466]
2467
2468# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs, so we can
2470# remove casts
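# For instance, one of the generated rules rewrites
# iadd(i2i32(u2u8(a@32)), b) into iadd(a, b) whenever only the low 8 bits of
# the iadd result are consumed, since the extra masking cannot change those
# bits.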
2471for N in [16, 32]:
2472    for M in [8, 16]:
2473        if M >= N:
2474            continue
2475
2476        aN = 'a@' + str(N)
2477        u2uM = 'u2u{0}'.format(M)
2478        i2iM = 'i2i{0}'.format(M)
2479
2480        for x in ['u', 'i']:
2481            x2xN = '{0}2{0}{1}'.format(x, N)
2482            extract_xM = 'extract_{0}{1}'.format(x, M)
2483
2484            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2485            extract_xM_M_bits = \
2486                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2487            optimizations += [
2488                ((x2xN_M_bits, (u2uM, aN)), a),
2489                ((extract_xM_M_bits, aN, 0), a),
2490            ]
2491
2492            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2493            optimizations += [
2494                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2495                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2496                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2497            ]
2498
2499            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2500                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2501                optimizations += [
2502                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2503                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2504                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2505                ]
2506
2507def fexp2i(exp, bits):
2508   # Generate an expression which constructs value 2.0^exp or 0.0.
2509   #
2510   # We assume that exp is already in a valid range:
2511   #
2512   #   * [-15, 15] for 16-bit float
2513   #   * [-127, 127] for 32-bit float
   #   * [-1023, 1023] for 64-bit float
2515   #
2516   # If exp is the lowest value in the valid range, a value of 0.0 is
2517   # constructed.  Otherwise, the value 2.0^exp is constructed.
2518   if bits == 16:
2519      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2520   elif bits == 32:
2521      return ('ishl', ('iadd', exp, 127), 23)
2522   elif bits == 64:
2523      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2524   else:
2525      assert False
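# For example (illustrative), fexp2i(4, 32) builds ('ishl', ('iadd', 4, 127), 23),
# which folds to the bit pattern 0x41800000, i.e. 16.0f, while fexp2i(-127, 32)
# folds to an all-zero word, i.e. +0.0f.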
2526
2527def ldexp(f, exp, bits):
2528   # The maximum possible range for a normal exponent is [-126, 127] and,
2529   # throwing in denormals, you get a maximum range of [-149, 127].  This
2530   # means that we can potentially have a swing of +-276.  If you start with
2531   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
2532   # all the way to zero.  The GLSL spec only requires that we handle a subset
2533   # of this range.  From version 4.60 of the spec:
2534   #
2535   #    "If exp is greater than +128 (single-precision) or +1024
2536   #    (double-precision), the value returned is undefined. If exp is less
2537   #    than -126 (single-precision) or -1022 (double-precision), the value
2538   #    returned may be flushed to zero. Additionally, splitting the value
2539   #    into a significand and exponent using frexp() and then reconstructing
2540   #    a floating-point value using ldexp() should yield the original input
2541   #    for zero and all finite non-denormalized values."
2542   #
2543   # The SPIR-V spec has similar language.
2544   #
2545   # In order to handle the maximum value +128 using the fexp2i() helper
2546   # above, we have to split the exponent in half and do two multiply
2547   # operations.
2548   #
2549   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
2550   # twice the full range that is valid for the fexp2i() function above.  If
2551   # exp/2 is the bottom value of that range, the fexp2i() expression will
2552   # yield 0.0f which, when multiplied by f, will flush it to zero which is
2553   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
2554   # value is clamped from above, then it must have been above the supported
2555   # range of the GLSL built-in and therefore any return value is acceptable.
2556   if bits == 16:
2557      exp = ('imin', ('imax', exp, -30), 30)
2558   elif bits == 32:
2559      exp = ('imin', ('imax', exp, -254), 254)
2560   elif bits == 64:
2561      exp = ('imin', ('imax', exp, -2046), 2046)
2562   else:
2563      assert False
2564
2565   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
2566   # (We use ishr which isn't the same for -1, but the -1 case still works
2567   # since we use exp-exp/2 as the second exponent.)  While the spec
2568   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
2569   # work with denormals and doesn't allow for the full swing in exponents
2570   # that you can get with normalized values.  Instead, we create two powers
2571   # of two and multiply by them each in turn.  That way the effective range
2572   # of our exponent is doubled.
2573   pow2_1 = fexp2i(('ishr', exp, 1), bits)
2574   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
2575   return ('fmul', ('fmul', f, pow2_1), pow2_2)
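# Rough illustration of the split: ldexp(f, 200) stays inside the clamp and
# expands to f * 2^100 * 2^100, while ldexp(f, -300) is clamped to -254 and
# expands to f * fexp2i(-127) * fexp2i(-127); fexp2i(-127) is +0.0, so the
# result flushes to zero, which the spec permits for such small exponents.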
2576
2577optimizations += [
2578   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
2579   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
2580   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
2581]
2582
2583# Unreal Engine 4 demo applications open-codes bitfieldReverse()
2584def bitfield_reverse_ue4(u):
2585    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2586    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
2587    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
2588    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
2589    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
2590
2591    return step5
2592
2593# Cyberpunk 2077 open-codes bitfieldReverse()
2594def bitfield_reverse_cp2077(u):
2595    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2596    step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2597    step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2598    step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2599    step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2600
2601    return step5
2602
2603optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2604optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2605
# VKD3D-Proton implements the DXBC f32 to f16 conversion using PackHalf2x16.
# Because the spec does not specify a rounding mode or behaviour regarding infinity,
# it emits a sequence to ensure D3D-like behaviour for infinity.
# When we know the current backend already behaves as we need, we can eliminate the extra sequence.
2610#
2611# Input is f32, output is u32 that has the f16 packed into its low bits.
2612def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a):
2613    packed_half = ('pack_half_2x16_rtz_split', a, 0)
2614    packed_half_minus1 = ('iadd', packed_half, 0xffffffff)
2615    f32_was_not_inf = ('ine', abs_a, 0x7f800000)
2616    f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00)
2617    return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half)
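# Illustrative note on the helper above: the iadd of 0xffffffff subtracts 1, so
# a packed 0x7c00/0xfc00 infinity that appeared only because of out-of-range
# rounding becomes the largest finite f16 magnitude (0x7bff/0xfbff), while a
# genuine f32 infinity keeps the unmodified packed value.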
2618
2619optimizations += [
2620   (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)),
2621   (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)),
2622   (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)),
2623]
2624
2625def vkd3d_proton_msad():
2626   pattern = None
2627   for i in range(4):
2628      ref = ('extract_u8', 'a@32', i)
2629      src = ('extract_u8', 'b@32', i)
2630      sad = ('iabs', ('iadd', ref, ('ineg', src)))
2631      msad = ('bcsel', ('ieq', ref, 0), 0, sad)
      if pattern is None:
2633         pattern = msad
2634      else:
2635         pattern = ('iadd', pattern, msad)
2636   pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:])
2637   return pattern
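# In other words, the constructed search pattern is the open-coded form of a
# masked sum of absolute byte differences: each of the four byte lanes
# contributes |a.byte[i] - b.byte[i]| unless the reference byte a.byte[i] is
# zero, which is what msad_4x8 with a zero accumulator computes.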
2638
2639optimizations += [
2640   (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'),
2641   (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)),
2642]
2643
2644
2645# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
2646# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
2647for ncomp in [2, 3, 4, 8, 16]:
2648   optimizations += [
2649      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
2650      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
2651      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
2652      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
2653   ]
2654
2655# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
2656# then the "a == a" is redundant because it's equivalent to "a is not NaN"
2657# and, if a is a NaN then the second comparison will fail anyway.
2658for op in ['flt', 'fge', 'feq']:
2659   optimizations += [
2660      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
2661      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
2662   ]
2663
2664# Add optimizations to handle the case where the result of a ternary is
2665# compared to a constant.  This way we can take things like
2666#
2667# (a ? 0 : 1) > 0
2668#
2669# and turn it into
2670#
2671# a ? (0 > 0) : (1 > 0)
2672#
2673# which constant folding will eat for lunch.  The resulting ternary will
2674# further get cleaned up by the boolean reductions above and we will be
2675# left with just the original variable "a".
2676for op in ['feq', 'fneu', 'ieq', 'ine']:
2677   optimizations += [
2678      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2679       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2680   ]
2681
2682for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
2683   optimizations += [
2684      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2685       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2686      ((op, '#d', ('bcsel', a, '#b', '#c')),
2687       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
2688   ]
2689
2690
2691# For example, this converts things like
2692#
2693#    1 + mix(0, a - 1, condition)
2694#
2695# into
2696#
2697#    mix(1, (a-1)+1, condition)
2698#
2699# Other optimizations will rearrange the constants.
2700for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
2701   optimizations += [
2702      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
2703   ]
2704
2705# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
2706# states:
2707#
2708#     If neither layout qualifier is specified, derivatives in compute shaders
2709#     return zero, which is consistent with the handling of built-in texture
2710#     functions like texture() in GLSL 4.50 compute shaders.
2711for op in ['fddx', 'fddx_fine', 'fddx_coarse',
2712           'fddy', 'fddy_fine', 'fddy_coarse']:
2713   optimizations += [
2714      ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
2715]
2716
2717# Some optimizations for ir3-specific instructions.
2718optimizations += [
2719   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
2720   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
2721   # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
2722   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
2723   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
2724]
2725
2726# These kinds of sequences can occur after nir_opt_peephole_select.
2727#
2728# NOTE: fadd is not handled here because that gets in the way of ffma
2729# generation in the i965 driver.  Instead, fadd and ffma are handled in
2730# late_optimizations.
2731
2732for op in ['flrp']:
2733    optimizations += [
2734        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2735        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2736        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2737        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2738        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2739        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2740    ]
2741
2742for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
2743    optimizations += [
2744        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2745        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2746        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2747        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2748    ]
2749
2750for op in ['fpow']:
2751    optimizations += [
2752        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2753        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2754        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
2755        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
2756    ]
2757
2758for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']:
2759    optimizations += [
2760        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
2761    ]
2762
2763for op in ['ineg', 'iabs', 'inot', 'isign']:
2764    optimizations += [
2765        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
2766    ]
2767
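# Sketch of the fisnormal bit trick below (shown for fp32; fp16/fp64 are
# analogous): shifting the raw bits left by one drops the sign, leaving
# (exponent << 24) | (mantissa << 1).  Adding 1 << 24 bumps the exponent field,
# so normal numbers (biased exponent 1..254) end up strictly above 0x1ffffff,
# zero and denormals stay at or below it, and Inf/NaN wrap around 2^32 and
# fail the comparison as well.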
2768optimizations.extend([
2769    (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'),
2770    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'),
2771    (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
2772    ])
2773
2774
2775"""
2776  if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16)
2777     return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */;
2778  else
2779     return f2f32(f2f16(val));
2780"""
2781optimizations.extend([
2782    (('fquantize2f16', 'a@32'),
2783     ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
2784               ('iand', a, 1 << 31),
2785               ('!f2f32', ('!f2f16_rtne', a))),
2786     'options->lower_fquantize2f16')
2787    ])
2788
2789for s in range(0, 31):
2790    mask = 0xffffffff << s
2791
2792    # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior
2793    # will never both have the same bits set, replacing the ior with an iadd
2794    # is safe (i.e., a carry out of a bit can never be generated). The iadd is
2795    # more likely to participate in other optimization patterns (e.g., iadd of
2796    # constant reassociation)
2797    optimizations.extend([
2798        (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)),
2799         'options->avoid_ternary_with_two_constants'),
2800    ])
2801
2802# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN.
2803# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here)
2804for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']:
2805    optimizations += [((op, '#a(is_nan)', b), NAN)]
2806    optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative
2807
2808# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN.
2809for op in ['ffma', 'flrp']:
2810    optimizations += [((op, '#a(is_nan)', b, c), NAN)]
2811    optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative
2812    optimizations += [((op, a, b, '#c(is_nan)'), NAN)]
2813
2814# NaN propagation: FP min/max. Pick the non-NaN operand.
2815for op in ['fmin', 'fmax']:
2816    optimizations += [((op, '#a(is_nan)', b), b)] # commutative
2817
2818# NaN propagation: ldexp is NaN if the first operand is NaN.
2819optimizations += [(('ldexp', '#a(is_nan)', b), NAN)]
2820
2821# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN.
2822for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']:
2823    optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative
2824
2825# NaN propagation: FP comparison opcodes except !=. Replace it with false.
2826for op in ['feq', 'fge', 'flt']:
2827    optimizations += [((op, '#a(is_nan)', b), False)]
2828    optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative
2829
2830# NaN propagation: FP comparison opcodes using !=. Replace it with true.
2831# Operator != is the only opcode where a comparison with NaN returns true.
2832for op in ['fneu']:
2833    optimizations += [((op, '#a(is_nan)', b), True)] # commutative
2834
2835# NaN propagation: FP comparison opcodes except != returning FP 0 or 1.
2836for op in ['seq', 'sge', 'slt']:
2837    optimizations += [((op, '#a(is_nan)', b), 0.0)]
2838    optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative
2839
2840# NaN propagation: FP comparison opcodes using != returning FP 0 or 1.
2841# Operator != is the only opcode where a comparison with NaN returns true.
2842optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative
2843
2844# This section contains optimizations to propagate downsizing conversions of
2845# constructed vectors into vectors of downsized components. Whether this is
2846# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
2847# this reduces the register pressure of the vector itself and often enables the
2848# conversions to be eliminated via other algebraic rules or constant folding.
2849# In the worst case on a SIMD architecture, the propagated conversions may be
2850# revectorized via nir_opt_vectorize so instruction count is minimally
2851# impacted.
2852#
2853# On a machine with SIMD-within-a-register only, this actually
# counterintuitively hurts instruction count. These machines are the same ones that
2855# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
2856# not being set.
2857#
2858# Finally for scalar architectures, there should be no difference in generated
2859# code since it all ends up scalarized at the end, but it might minimally help
2860# compile-times.
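#
# Concretely, the loop below generates rewrites of the shape
# f2f16(vec2(a@32, b@32)) -> vec2(f2f16(a), f2f16(b)) (and the analogous i2i16,
# u2u16 and *2*mp forms), guarded on vectorize_vec2_16bit not being set.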
2861
2862for i in range(2, 4 + 1):
2863   for T in ('f', 'u', 'i'):
2864      vec_inst = ('vec' + str(i),)
2865
2866      indices = ['a', 'b', 'c', 'd']
2867      suffix_in = tuple((indices[j] + '@32') for j in range(i))
2868
2869      to_16 = '{}2{}16'.format(T, T)
2870      to_mp = '{}2{}mp'.format(T, T)
2871
2872      out_16 = tuple((to_16, indices[j]) for j in range(i))
2873      out_mp = tuple((to_mp, indices[j]) for j in range(i))
2874
2875      optimizations  += [
2876         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
2877      ]
2878      # u2ump doesn't exist, because it's equal to i2imp
2879      if T in ['f', 'i']:
2880          optimizations  += [
2881             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
2882          ]
2883
2884# This section contains "late" optimizations that should be run before
2885# creating ffmas and calling regular optimizations for the final time.
2886# Optimizations should go here if they help code generation and conflict
2887# with the regular optimizations.
2888before_ffma_optimizations = [
2889   # Propagate constants down multiplication chains
2890   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
2891   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
2892   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
2893   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
2894
2895   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
2896   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
2897   (('~fadd', ('fneg', a), a), 0.0),
2898   (('iadd', ('ineg', a), a), 0),
2899   (('iadd', ('ineg', a), ('iadd', a, b)), b),
2900   (('iadd', a, ('iadd', ('ineg', a), b)), b),
2901   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
2902   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
2903
2904   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
2905   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
2906   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
2907]

# This section contains "late" optimizations that should be run after the
# regular optimizations have finished.  Optimizations should go here if
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
   # results if one operand is +Inf and the other is -Inf.
   #
   # 1. Inf + -Inf = NaN
   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
   # 3. ∀x: x != NaN = true
   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
   #
   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
   #  (a+b) < 0        false            false       false    false
   #      a < -b       false            false       false    false
   # -(a+b) < 0        false            false       false    false
   #     -a < b        false            false       false    false
   #  (a+b) >= 0       false            false       false    false
   #      a >= -b      true             true        false    false
   # -(a+b) >= 0       false            false       false    false
   #     -a >= b       true             true        false    false
   #  (a+b) == 0       false            false       false    false
   #      a == -b      true             true        false    false
   #  (a+b) != 0       true             true        true     true
   #      a != -b      false            false       true     true
   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),

   # If either source must be finite, then the original (a+b) cannot produce
   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
   # result if b is NaN. Therefore, the replacements are exact.
   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),

   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
   # SpvOpLessOrGreater.
   (('iand', ('fneu', a, b),   ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))),
   (('iand', ('fneu', a, 0.0),          ('feq', a, a)                ), ('!flt', 0.0, ('fabs', a))),

   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
   # !SpvOpLessOrGreater.
   (('ior', ('feq', a, b),   ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))),
   (('ior', ('feq', a, 0.0),         ('fneu', a, a),                ), ('inot', ('!flt', 0.0, ('fabs', a)))),

   # nir_lower_to_source_mods will collapse this, but its existence during the
   # optimization loop can prevent other optimizations.
   (('fneg', ('fneg', a)), a)
]
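
# Illustrative spot check of the table above using Python floats (an assumed
# stand-in for 32-bit float behaviour; it adds no rules and has no effect on
# the generated pass).  The 'flt' rows agree in every column, which is why
# those rewrites are exact, while the 'fge' and 'fneu' rows disagree for
# Inf/-Inf, which is why those rewrites are marked inexact ('~').
for _a, _b in [(math.inf, -math.inf), (-math.inf, math.inf)]:
    assert ((_a + _b) < 0.0) == (_a < -_b)          # both false
    assert ((_a + _b) >= 0.0) != (_a >= -_b)        # false vs. true
    assert ((_a + _b) != 0.0) != (_a != -_b)        # true vs. false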

# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
# gets combined to fma(a, b, -c).
for sz, mulz in itertools.product([16, 32, 64], [False, True]):
    # fmulz/ffmaz only for fp32
    if mulz and sz != 32:
        continue

    # Fuse the correct fmul. Only consider fmuls where the only users are fadd
    # (or fneg/fabs which are assumed to be propagated away), as a heuristic to
    # avoid fusing in cases where it's harmful.
    fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)'
    ffma = 'ffmaz' if mulz else 'ffma'

    fadd = '~fadd@{}'.format(sz)
    option = 'options->fuse_ffma{}'.format(sz)

    late_optimizations.extend([
        ((fadd, (fmul, a, b), c), (ffma, a, b, c), option),

        ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c),
         (ffma, ('fneg', a), b, c), option),

        ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c),
         (ffma, ('fabs', a), ('fabs', b), c), option),

        ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c),
         (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option),
    ])
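
# Illustrative only (assumed expansion, not an extra rule): for sz == 32 and
# mulz == False the first entry appended above is equivalent to
#
#    (('~fadd@32', ('fmul(is_only_used_by_fadd)', a, b), c),
#     ('ffma', a, b, c), 'options->fuse_ffma32')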

late_optimizations.extend([
   # Subtractions get lowered during optimization, so we need to recombine them
   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),

   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
   # On Intel GPUs, the constant field for an ADD3 instruction must be either
   # int16_t or uint16_t.
   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
   (('iadd', ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'),   ('iadd3', a, b, c), 'options->has_iadd3'),
   (('iadd', ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'),
   (('iadd', ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'),
   (('iadd', ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'),  ('iadd3', ('ineg', a), ('ineg', b), c), 'options->has_iadd3'),

    # fneg_lo / fneg_hi
   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),

   # These are duplicated from the main optimizations table.  The late
   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
   # new patterns like these.  The patterns that compare with zero are removed
   # because they are unlikely to be created by anything in
   # late_optimizations.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   (('ior', a, a), a),
   (('iand', a, a), a),

   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),

   # Approximate handling of fround_even for DX9 addressing from gallium nine on
   # DX9-class hardware with no proper fround support.  This is in
   # late_optimizations so that the is_integral() opts in the main pass get a
   # chance to eliminate the fround_even first.
   (('fround_even', a), ('bcsel',
                         ('feq', ('ffract', a), 0.5),
                         ('fadd', ('ffloor', ('fadd', a, 0.5)), -1.0),
                         ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),
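   # For example (illustrative only, assuming lower_fround_even is set):
   # a = 2.5 has ffract(a) == 0.5, so the result is ffloor(3.0) - 1.0 == 2.0,
   # while a = 2.3 takes the other arm and yields ffloor(2.8) == 2.0.  Ties are
   # always rounded down (toward -inf) rather than to even, hence "approximate".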

   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
   # particular operation is common for expanding values stored in a texture
   # from [0,1] to [-1,1].
   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

    # flrp(a, b, a)
    # a*(1-a) + b*a
    # a + -a*a + a*b    (1)
    # a + a*(b - a)
    # Option 1: ffma(a, (b-a), a)
    #
    # Alternately, after (1):
    # a*(1+b) + -a*a
    # a*((1+b) + -a)
    #
    # Let b=1
    #
    # Option 2: ffma(a, 2, -(a*a))
    # Option 3: ffma(a, 2, (-a)*a)
    # Option 4: ffma(a, -a, (2*a))
    # Option 5: a * (2 - a)
    #
    # There are a lot of other possible combinations.
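    #
    # Numeric spot check of the equivalence (illustrative only): with a = 0.25,
    # flrp(a, 1, a) = 0.25*0.75 + 1*0.25 = 0.4375, and Option 2 gives
    # ffma(0.25, 2, -(0.25*0.25)) = 0.5 - 0.0625 = 0.4375.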
   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),

   # we do these late so that we don't get in the way of creating ffmas
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

   # Things that look like DPH in the source shader may get expanded to
   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
   # to NIR.  After FFMA is generated, this can look like:
   #
   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   #
   # Reassociate the last addition into the first multiplication.
   #
   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
   # shader stages on some outputs that are intended to be invariant.  For
   # various reasons, this optimization may not be fully applied in all
   # shaders used for different rendering passes of the same geometry.  This
   # can result in Z-fighting artifacts (at best).  For now, disable this
   # optimization in these stages.  See bugzilla #111490.  In tessellation
   # stages applications seem to use 'precise' when necessary, so allow the
   # optimization in those stages.
   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
   #
   #    If bits is zero, the result will be zero.
   #
   # These prevent the next two lowerings generating incorrect results when
   # count is zero.
   (('ubfe', a, b, 0), 0),
   (('ibfe', a, b, 0), 0),

   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
   # instructions on Intel GPUs, it cannot have immediate values as
   # sources.  There are also limitations on source register strides.  As a
   # result, it is very easy for a 3-source instruction combined with either
   # loads of immediate values or copies from weird register strides to be
   # more expensive than the primitive instructions it represents.
   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),

   # b is the lowest order bit to be extracted and c is the number of bits to
   # extract.  The inner shift removes the bits above b + c by shifting left
   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
   # This means that it must be shifted right by 32 - c or -c bits.
   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),
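   # Worked example of the ibfe lowering above (illustrative only, assuming
   # 32-bit values and 5-bit shift counts): for a = 0xdeadbeef, b = 4, c = 8,
   # ibfe extracts the byte 0xee and sign-extends it to -18.  The lowering
   # shifts left by -(4 + 8) & 31 = 20, giving 0xeef00000, then arithmetic
   # right by -8 & 31 = 24, which also yields -18.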

   # Clean up no-op shifts that may result from the bfe lowerings.
   (('ishl', a, 0), a),
   (('ishl', a, -32), a),
   (('ishr', a, 0), a),
   (('ishr', a, -32), a),
   (('ushr', a, 0), a),

   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),

   # open coded bit test
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
   (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'),
   (('bitz', ('ushr', a, b), 0), ('bitz', a, b)),
   (('bitz', ('ishr', a, b), 0), ('bitz', a, b)),
   (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)),
   (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)),
   (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'),
   (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
   (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
   (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'),
   (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'),
   (('inot', ('bitnz', a, b)), ('bitz', a, b)),
   (('inot', ('bitz', a, b)), ('bitnz', a, b)),
   (('bitnz', ('inot', a), b), ('bitz', a, b)),
   (('bitz', ('inot', a), b), ('bitnz', a, b)),
])

# A few more extract cases we'd rather leave late
for N in [16, 32]:
    aN = 'a@{0}'.format(N)

    for x in ['u', 'i']:
        x2xN = '{0}2{0}{1}'.format(x, N)
        extract_x8 = 'extract_{0}8'.format(x)
        extract_x16 = 'extract_{0}16'.format(x)

        late_optimizations.extend([
            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
        ])

        if N > 16:
            late_optimizations.extend([
                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
            ])
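
# Illustrative only (assumed expansion, not an extra rule): for N == 32 and
# x == 'u' the loop above appends, e.g.,
#
#    (('u2u32', ('u2u8', 'a@32')), ('extract_u8', a, 0),
#     '!options->lower_extract_byte')
#
# i.e. a down-conversion to 8 bits followed by an up-conversion back is just a
# byte extract when the backend supports it.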

# Byte insertion
late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))
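
# Illustrative only (assumed expansion of the comprehensions above): for i == 1
# the first one yields
#
#    (('ishl', ('extract_u8', 'a@32', 0), 8), ('insert_u8', a, 1),
#     '!options->lower_insert_byte')
#
# i.e. shifting the low byte of a into byte 1 is an insert_u8.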

late_optimizations += [
   # Word insertion
   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),

   # Extract and then insert
   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
]

# Integer sizes
for s in [8, 16, 32, 64]:
    late_optimizations.extend([
        (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),
        (('ior',  ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),
    ])
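
# The umin rules above rely on min(a, b) being non-zero exactly when both a
# and b are non-zero (treating the operands as unsigned).  A tiny illustrative
# check with Python ints; it adds no rules and has no effect on the pass.
for _ua, _ub in [(0, 0), (0, 7), (7, 0), (3, 9)]:
    assert (min(_ua, _ub) != 0) == (_ua != 0 and _ub != 0)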

# Float sizes
for s in [16, 32, 64]:
    late_optimizations.extend([
       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
    ])

for op in ['fadd']:
    late_optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
    ]

for op in ['ffma', 'ffmaz']:
    late_optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
    ]

# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
#
# This must be done in late optimizations, because we need normal optimizations to
# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
#
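# Illustrative only (assumed expansion of the loops below): for the binary
# case and op == 'fadd' the generated rule is equivalent to
#
#    (('~f2f32', ('fadd', ('f2fmp', a), ('f2fmp', b))), ('fadd', a, b))
#
# i.e. a mediump add sandwiched between conversions becomes a 32-bit add.
#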
# Unary opcodes
for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

# Binary opcodes
for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

# Ternary opcodes
for op in ['ffma', 'flrp']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

# Comparison opcodes
for op in ['feq', 'fge', 'flt', 'fneu']:
    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
  # any conversions that could have been removed will have been removed in
  # nir_opt_algebraic so any remaining ones are required.
  (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"),
  (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"),
  (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"),
  (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"),
  (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"),
  (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"),
  (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"),
  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),

  (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"),

  (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
  (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
  (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
  (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

  (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
  (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
  (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
  (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),

  (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
  (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
  (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
  (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
]

distribute_src_mods = [
   # Try to remove some spurious negations rather than pushing them down.
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),

   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),

   # fdph works mostly like fdot, but to get the correct result, the negation
   # must be applied to the second source.
   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
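   # (Illustrative reasoning: fdph(a, b) = a.x*b.x + a.y*b.y + a.z*b.z + b.w,
   # so fdph(a, -b) = -(a.x*b.x + a.y*b.y + a.z*b.z + b.w) = -fdph(a, b), while
   # negating a would leave the lone b.w term un-negated.)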

   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                  before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
                                  distribute_src_mods).render())
