# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import argparse
from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

has_fmulz = '(options->has_fmulz || \
              (options->has_fmulz_no_denorms && \
               !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
#
# Another set of special "conditions" are
# "nsz": sign of zero is not preserved
# "ninf": infinities are not preserved
# "nnan": nan is not preserved
# These relate to the float controls/fpfastmath and are more descriptions of the
# expression than conditions. That is, an expression with the "nsz" condition
# means that the replacement expression won't preserve the sign of zero of the
# result, and so it will be skipped if the matching instruction has the
# 'signed_zero_preserve' flag set.
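#
# As an illustration only (this is not one of the rules below), a complete
# pattern might look like:
#
#    (('~fadd', ('fmul', 2.0, a), ('fmul', 2.0, b)),   # search: 2*a + 2*b
#     ('fmul', 2.0, ('fadd', a, b)),                   # replace: 2*(a + b)
#     '!options->lower_fadd')                          # hypothetical condition string
#
# The '~' marks the fadd as inexact, '#a' would restrict a variable to
# constants, and the optional trailing string is a C expression checked
# against the compiler options; 'options->lower_fadd' is a made-up name used
# purely for illustration.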

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
    return struct.unpack('!f', struct.pack('!I', i))[0]
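# For example, intBitsToFloat(0x3f800000) == 1.0 and intBitsToFloat(0x40490fdb)
# is (approximately) pi; the helper just reinterprets the 32-bit pattern.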

# Takes a pattern as input and returns a list of patterns where each
# pattern has a different permutation of fneg/fabs(value) as the replacement
# for the key operands in replacements.
def add_fabs_fneg(pattern, replacements, commutative = True):
    def to_list(pattern):
        return [to_list(i) if isinstance(i, tuple) else i for i in pattern]

    def to_tuple(pattern):
        return tuple(to_tuple(i) if isinstance(i, list) else i for i in pattern)

    def replace_variable(pattern, search, replace):
        for i in range(len(pattern)):
            if pattern[i] == search:
                pattern[i] = replace
            elif isinstance(pattern[i], list):
                replace_variable(pattern[i], search, replace)

    if commutative:
        perms = itertools.combinations_with_replacement(range(4), len(replacements))
    else:
        perms = itertools.product(range(4), repeat=len(replacements))

    result = []

    for perm in perms:
        curr = to_list(pattern)

        for i, (search, base) in enumerate(replacements.items()):
            if perm[i] == 0:
                replace = ['fneg', ['fabs', base]]
            elif perm[i] == 1:
                replace = ['fabs', base]
            elif perm[i] == 2:
                replace = ['fneg', base]
            elif perm[i] == 3:
                replace = base

            replace_variable(curr, search, replace)

        result.append(to_tuple(curr))
    return result

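# As an illustrative sketch (not itself a rule): with the default
# commutative=True and a single key,
#
#    add_fabs_fneg((('fmul', 'ma', b), ('fmul', 'ma', b)), {'ma': a})
#
# yields four patterns in which 'ma' is replaced, in both the search and the
# replace halves, by fneg(fabs(a)), fabs(a), fneg(a), and plain a.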

optimizations = [
   # These will be recreated by late_algebraic if supported.
   # Lowering here means we don't have to duplicate all other optimization patterns.
   (('fgeu', a, b), ('inot', ('flt', a, b))),
   (('fltu', a, b), ('inot', ('fge', a, b))),
   (('fneo', 0.0, a), ('flt', 0.0, ('fabs', a))),
   (('fequ', 0.0, a), ('inot', ('flt', 0.0, ('fabs', a)))),


   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, that instruction should flush any input
   # denormals, and we can replace -0.0 with 0.0 if the float execution mode
   # allows it.
   (('fadd(is_only_used_as_float,nsz)', 'a', 0.0), a),
   (('fadd(is_only_used_as_float)', a, '#b(is_negative_zero)'), a),
   (('fadd', ('fneg', a), '#b(is_negative_zero)'), ('fneg', a)),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)),
   (('iand', ('iand', a, b), ('iand(is_used_once)', a, c)), ('iand', ('iand', a, b), c)),
   (('ior', ('ior', a, b), ('ior(is_used_once)', a, c)), ('ior', ('ior', a, b), c)),
   (('iand', ('ior(is_used_once)', a, b), ('ior(is_used_once)', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), ('iand', a, ('ior', b, c))),
   # (a & b) | (a | c)  =>  ((a & b) | a) | c  =>  a | c
   (('ior', ('iand', a, b), ('ior', a, c)), ('ior', a, c)),
   # (a & b) & (a | c)  =>  b & (a & (a | c))  =>  b & a
   (('iand', ('iand', a, b), ('ior', a, c)), ('iand', a, b)),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)),
   (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))),
   (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('fadd', a, a), ('fmul', a, 2.0)),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul(nsz,nnan)', 'a', 0.0), 0.0),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz(nsz)', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c)),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, that instruction should flush any input
   # denormals and this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma(is_only_used_as_float,nsz,nnan,ninf)', 0.0, a, b), b),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)),
   (('ffmaz(nsz)', a, b, 0.0), ('fmulz', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz(nsz)', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz(nsz)', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros on each of the first positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   *add_fabs_fneg((('fmul@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb')),
    ('fmulz', 'ma', 'mb'), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('fmul@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
    ('fmulz', 'ma', b), has_fmulz), {'ma' : a}),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   *add_fabs_fneg((('ffma@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb'), c),
    ('ffmaz', 'ma', 'mb', c), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('ffma@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
    ('ffmaz', 'ma', b, c), has_fmulz), {'ma' : a}),

   # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b))
   *add_fabs_fneg((('bcsel(nsz,nnan,ninf)', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, 'mb'))),
    ('fexp2', ('fmulz', a, 'mb')),
    has_fmulz), {'mb': b}),
   *add_fabs_fneg((('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmulz', a, 'mb'))),
    ('fexp2', ('fmulz', a, 'mb'))), {'mb': b}),
]

# Bitwise operations affecting the sign may be replaced by equivalent
# floating point operations, except possibly for denormal
# behaviour, hence the is_only_used_as_float.
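# For example, with sz == 32 the sign bit is 0x80000000, so
# iand(a, 0x7fffffff) acts as fabs(a), ixor(a, 0x80000000) as fneg(a), and
# ior(a, 0x80000000) as fneg(fabs(a)).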
for sz in (16, 32, 64):
    sign_bit = 1 << (sz - 1)

    optimizations.extend([
        (('iand(is_only_used_as_float)', f'a@{sz}', sign_bit - 1), ('fabs', a)),
        (('ixor(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', a)),
        (('ior(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', ('fabs', a))),
    ])

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
    optimizations.extend([
       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
       # These are the same as the previous three rules, but they depend on
       # 1-fsat(x) <=> fsat(1-x).  See below.
       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

       # These two aren't flrp lowerings, but do appear in some shaders.
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

       # 1 - ((1 - a) * (1 - b))
       # 1 - (1 - a - b + a*b)
       # 1 - 1 + a + b - a*b
       # a + b - a*b
       # a + b*(1 - a)
       # b*(1 - a) + 1*a
       # flrp(b, 1, a)
       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
    ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),

   # D3D9 vertex shader trunc
   (('fadd', ('ffloor', a), ('b2f', ('iand', ('flt', a, 0), ('flt', ('fneg', ('ffract', a)), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))),
   # D3D9 pixel shader trunc
   (('fadd', ('ffloor', a), ('b2f', ('inot', ('fge', 0, ('fmin', ('fneg', a), ('ffract', a)))))), ('ftrunc', ('fadd', a, 0))),
   (('fadd', ('ffloor', a), ('b2f', ('flt', 0, ('fmin', ('fneg', a), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))),

   (('fadd(nnan,nsz)', a, ('ffract', ('fneg', a))), ('fceil', a), '!options->lower_fceil'),

   (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))),
    '(options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)) && (!(options->lower_doubles_options & nir_lower_dfloor) || !(options->lower_doubles_options & nir_lower_dfract))'),

   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fadd@16(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)),
    '(options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)) && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', a, 0.0), 0.0),
   (('fdot3', a, 0.0), 0.0),
   (('fdot2', a, 0.0), 0.0),

   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
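# For example, with 32-bit values, ishl(ishl(a, 20), 20) must fold to 0 rather
# than to ishl(a, 40), since the hardware would treat a shift count of 40 as
# 40 & 31 == 8.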
for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.
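# For example, with 32-bit values, ishl(ushr(a, 2), 2) becomes
# iand(a, 0xfffffffc) once ishl(0xffffffff, 2) is constant folded.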

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
    a_sz = 'a@{}'.format(size)

    optimizations.extend([
       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

       # This does not trivially work with ishr.
       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
    ])

# Collapses ubfe(ubfe(a, b, c), d, e) when b, c, d, e are constants.
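# As a worked example (not itself a rule): ubfe(ubfe(a, 8, 16), 4, 8) extracts
# bits [4, 11] of bits [8, 23] of a, so it collapses to ubfe(a, 12, 8).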
def ubfe_ubfe(a, b, c, d, e):
    inner_offset = ('iand', b, 0x1f)
    inner_bits = ('umin', ('iand', c, 0x1f), ('isub', 32, inner_offset))
    outer_offset = ('iand', d, 0x1f)
    outer_bits = ('iand', e, 0x1f)

    offset = ('iadd', inner_offset, outer_offset)
    bits = ('umin', outer_bits, ('imax', ('isub', inner_bits, outer_offset), 0))
    collapsed = ('ubfe', a, offset, bits)
    offset_out_of_range = ('ilt', 31, offset)

    # This will be constant-folded to either 0 or the collapsed ubfe,
    # whose offset and bits operands will also be constant folded.
    return ('bcsel', offset_out_of_range, 0, collapsed)

optimizations.extend([
    # Create bitfield extract from right-shift + and pattern.
    (('iand@32', ('ushr@32(is_used_once)', a, b), '#c(is_const_bitmask)'),
     ('ubfe', a, b, ('bit_count', c)),
     'options->has_bfe && !options->avoid_ternary_with_two_constants'),

    (('iand@32', ('ushr@32', a, b), ('bfm', c, 0)),
     ('ubfe', a, b, c), 'options->has_bfe'),

    (('ushr', ('iand', a, ('bfm', c, b)), b),
     ('ubfe', a, b, c), 'options->has_bfe'),

    # Collapse two bitfield extracts with constant operands into a single one.
    (('ubfe', ('ubfe', a, '#b', '#c'), '#d', '#e'),
     ubfe_ubfe(a, b, c, d, e)),

    # Collapse non-zero right-shift into bitfield extract.
    (('ushr@32', ('ubfe', a, '#b', '#c'), '#d(is_5lsb_not_zero)'),
     ubfe_ubfe(a, b, c, d, 31)),

    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
       # Reassociate for improved CSE
       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    lo_mask = 0xffffffff >> i
    hi_mask = (0xffffffff << i) & 0xffffffff

    optimizations.extend([
        # This pattern seems to only help in the soft-fp64 code.
        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', 'a(is_not_const)'), '#b'), ('flt', ('fneg', b), a)),
   (('flt', '#b', ('fneg', 'a(is_not_const)')), ('flt', a, ('fneg', b))),
   (('fge', ('fneg', 'a(is_not_const)'), '#b'), ('fge', ('fneg', b), a)),
   (('fge', '#b', ('fneg', 'a(is_not_const)')), ('fge', a, ('fneg', b))),
   (('fneu', ('fneg', 'a(is_not_const)'), '#b'), ('fneu', ('fneg', b), a)),
   (('feq', '#b', ('fneg', 'a(is_not_const)')), ('feq', a, ('fneg', b))),
   (('flt', a, '#b(is_negative_zero)'), ('flt', a, 0.0)),
   (('flt', '#b(is_negative_zero)', a), ('flt', 0.0, a)),
   (('fge', a, '#b(is_negative_zero)'), ('fge', a, 0.0)),
   (('fge', '#b(is_negative_zero)', a), ('fge', 0.0, a)),
   (('fneu', a, '#b(is_negative_zero)'), ('fneu', 0.0, a)),
   (('feq', '#b(is_negative_zero)', a), ('feq', a, 0.0)),

   (('ieq', ('ineg', a), 0),  ('ieq', a, 0)),
   (('ine', ('ineg', a), 0),  ('ine', a, 0)),
   (('ieq', ('iabs', a), 0),  ('ieq', a, 0)),
   (('ine', ('iabs', a), 0),  ('ine', a, 0)),
   (('fneu', ('fabs', a), 0.0), ('fneu', a, 0.0)),
   (('feq', ('fabs', a), 0.0), ('feq', a, 0.0)),
   (('fneu', ('fabs', a), ('fabs', a)), ('fneu', a, a)),
   (('feq', ('fabs', a), ('fabs', a)), ('feq', a, a)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('flt',  0.0, ('b2f', 'a@1')), a),
   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0),   a),
   (('ieq', 'a@1', False), ('inot', a)),
   (('ieq', 'a@1', True), a),
   (('ine', 'a@1', False), a),
   (('ine', 'a@1', True), ('inot', a)),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a)   because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
   (('fmin', ('fmax', 'a(is_finite)', b), a), ('fmul', 1.0, a)),
   (('fmax', ('fmin', 'a(is_finite)', b), a), ('fmul', 1.0, a)),
   (('umin', ('umax', a, b), a), a),
   (('umax', ('umin', a, b), a), a),
   (('imin', ('imax', a, b), a), a),
   (('imax', ('imin', a, b), a), a),
])

for N in [8, 16, 32, 64]:
    b2iN = 'b2i{0}'.format(N)
    optimizations.extend([
        (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
        (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
    ])

for N in [16, 32, 64]:
    b2fN = 'b2f{0}'.format(N)
    optimizations.extend([
        (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
        (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
    ])

# Integer sizes
for s in [8, 16, 32, 64]:
    optimizations.extend([
       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

       # Simplify logic to detect sign of an integer.
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
    ])

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat(nsz)', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # If a >= 0 ... 1 + a >= 1 ... so fsat(1 + a) = 1
   (('fsat', ('fadd', 1.0, 'a(is_ge_zero)')), 1.0),

   # Let constant folding do its job. This can have emergent behaviour.
   (('fneg', ('bcsel(is_used_once)', a, '#b', '#c')), ('bcsel', a, ('fneg', b), ('fneg', c))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
1007   # to prevent other optimizations from ruining the "NaN cleansing" property
1008   # of the fmin or fmax.
1009   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
1010   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
1011   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
1012   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
1013   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
1014   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
1015   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
1016   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
1017   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
1018   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
1019   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
1020   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
1021   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
1022   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
1023   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
1024   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),
1025
1026   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
1027   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
1028   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
1029   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
1030   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
1031   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
1032   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
1033   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
1034   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
1035   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
1036   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
1037   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
1038   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
1039   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
1040   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
1041   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
1042
1043   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y
1044   # < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
1045   # single step.  Doing just the replacement can lead to an infinite loop as
1046   # the pattern is repeatedly applied to the result of the previous
1047   # application of the pattern.
1048   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
1049   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
1050   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
1051   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
1052
1053   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
1054   # numbers, then it can be replaced with fneu.
1055   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
1056
1057   # Other patterns may optimize the resulting iand tree further.
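   # (Since b has a single bit set, each iand below is either 0 or b, and on
   # values restricted to {0, b} umin agrees with bitwise AND, so the
   # replacement is exact.)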
1058   (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)),
1059    ('iand', ('iand', a, b), ('iand', c, b))),
1060])
1061
1062# Float sizes
1063for s in [16, 32, 64]:
1064    if s == 64:
1065        match_fsign_cond = "!options->lower_fsign && !(options->lower_doubles_options & nir_lower_dsign)"
1066    else:
1067        match_fsign_cond = "!options->lower_fsign"
1068    optimizations.extend([
1069       # These derive from the previous patterns with the application of b < 0 <=>
1070       # 0 < -b.  The transformation should be applied if either comparison is
1071       # used once as this ensures that the number of comparisons will not
1072       # increase.  The sources to the ior and iand are not symmetric, so the
1073       # rules have to be duplicated to get this behavior.
1074       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
1075       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
1076       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
1077       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
1078       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
1079       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
1080       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
1081       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
1082
1083       (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
1084       (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
1085       (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
1086       (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
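       # (The idea behind the four patterns above: fabs makes both operands
       # non-negative, so their minimum is zero iff at least one of them is
       # zero and their sum is zero iff both are.  The particular choice of
       # fmin vs. fadd in each replacement also keeps the NaN behaviour of the
       # original pair of comparisons.)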
1087
1088       # The (i2f, ...) part is an open-coded fsign.  When that is combined
1089       # with the bcsel, it's basically copysign(1.0, a).  There are some
1090       # behavior differences between this pattern and copysign w.r.t. ±0 and
1091       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
1092       # to x, regardless of whether either or both values are NaN.
1093       #
1094       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
1095       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
1096       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
1097       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
1098       #
1099       # For all other values of 'a', the original and replacement behave as
1100       # copysign.
1101       #
1102       # Marking the replacement comparisons as precise prevents any future
1103       # optimizations from replacing either of the comparisons with the
1104       # logical-not of the other.
1105       #
1106       # Note: Use b2i32 in the replacement because some platforms that
1107       # support fp16 don't support int16.
1108       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
1109        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),
1110
1111       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),
1112
1113       # The C spec says, "If the value of the integral part cannot be represented
1114       # by the integer type, the behavior is undefined."  "Undefined" can mean
1115       # "the conversion doesn't happen at all."
1116       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),
1117
1118       # Ironically, mark these as imprecise because removing the conversions may
1119       # preserve more precision than doing the conversions (e.g.,
1120       # uint(float(0x81818181u)) == 0x81818200).
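       # (Worked example: 0x81818181 needs 32 significant bits, but float32 has
       # a 24-bit significand; the discarded low byte 0x81 is more than half an
       # ulp, so the value rounds up to 0x81818200 and the round trip is not
       # the identity.)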
1121       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
1122       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
1123       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
1124       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
1125
1126       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond),
1127       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond),
1128
1129       # float? -> float? -> floatS ==> float? -> floatS
1130       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),
1131
1132       # int? -> float? -> floatS ==> int? -> floatS
1133       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
1134       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),
1135
1136       # float? -> float? -> intS ==> float? -> intS
1137       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
1138       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
1139
1140       # HLSL's sign function returns an integer
1141       (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)),
1142    ])
1143
1144    for B in [32, 64]:
1145        if s < B:
1146            optimizations.extend([
1147               # S = smaller, B = bigger
1148               # floatS -> floatB -> floatS ==> identity
1149               (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
1150
1151               # floatS -> floatB -> intB ==> floatS -> intB
1152               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
1153               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
1154
1155               # int? -> floatB -> floatS ==> int? -> floatS
1156               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
1157               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),
1158            ])
1159
1160for S in [1, 8, 16, 32]:
1161    for B in [8, 16, 32, 64]:
1162        if B <= S:
1163            continue
1164        optimizations.extend([
1165            # intS -> intB -> intS ==> identity
1166            (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a),
1167            (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a),
1168        ])
1169
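        # NIR has no 8-bit float type, so u2f8/i2f8 do not exist; skip B == 8
        # for the int -> float patterns below.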
1170        if B < 16:
1171            continue
1172        for C in [8, 16, 32, 64]:
1173            if C <= S:
1174                continue
1175            optimizations.extend([
1176                # intS -> intC -> floatB ==> intS -> floatB
1177                (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)),
1178                (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)),
1179            ])
1180
1181# mediump variants of the above
1182optimizations.extend([
1183    # int32 -> float32 -> float16 ==> int32 -> float16
1184    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
1185    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),
1186
1187    # float32 -> float16 -> int16 ==> float32 -> int16
1188    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
1189    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),
1190
1191    # float32 -> int32 -> int16 ==> float32 -> int16
1192    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
1193    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),
1194
1195    # int32 -> int16 -> float16 ==> int32 -> float16
1196    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
1197    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
1198])
1199
1200# Clean up junk left from 8-bit integer to 16-bit integer lowering.
1201optimizations.extend([
1202    # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
1203    # accomplished by masking off the upper 8 bits of the immediate operand of
1204    # the iand instruction.  Often, both patterns will end up being applied
1205    # to the same original expression tree.
1206    (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),               ('iand', a, ('iand', b, 0xff))),
1207    (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
1208])
1209
1210for op in ['iand', 'ior', 'ixor']:
1211    optimizations.extend([
1212        (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
1213        (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),
1214
1215        # Undistribute extract from a logic op
1216        ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
1217        ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
1218        ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
1219        ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),
1220
1221        # Undistribute shifts from a logic op
1222        ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
1223        ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
1224        ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
1225    ])
1226
1227# Integer sizes
1228for s in [8, 16, 32, 64]:
1229    amount_bits = int(math.log2(s))
1230
1231    lower_umin = 'options->lower_umin'
1232    lower_umax = 'options->lower_umax'
1233    lower_imin = 'false'
1234    lower_imax = 'false'
1235    lower_ior = 'options->lower_bitops'
1236    if s == 64:
1237       lower_umin = '(options->lower_umin || (options->lower_int64_options & nir_lower_minmax64) != 0)'
1238       lower_umax = '(options->lower_umax || (options->lower_int64_options & nir_lower_minmax64) != 0)'
1239       lower_imin = '((options->lower_int64_options & nir_lower_minmax64) != 0)'
1240       lower_imax = '((options->lower_int64_options & nir_lower_minmax64) != 0)'
1241       lower_ior = '(options->lower_bitops || (options->lower_int64_options & nir_lower_logic64) != 0)'
1242
1243    optimizations.extend([
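       # Combine zero tests: for unsigned values, (a == 0 && b == 0) is
       # equivalent to (a | b) == 0 and to umax(a, b) == 0, while
       # (a == 0 || b == 0) is equivalent to umin(a, b) == 0; the ine forms
       # below are the corresponding negations.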
1244       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), lower_umax + ' && !' + lower_ior),
1245       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), lower_umin + ' && !' + lower_ior),
1246       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!'+lower_umax),
1247       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!'+lower_umin),
1248       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!'+lower_umin),
1249       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!'+lower_umax),
1250
1251       (('bcsel', ('ult', 'b@{}'.format(s), a), b, a), ('umin', a, b), '!'+lower_umin),
1252       (('bcsel', ('ult', 'a@{}'.format(s), b), b, a), ('umax', a, b), '!'+lower_umax),
1253       (('bcsel', ('uge', 'a@{}'.format(s), b), b, a), ('umin', a, b), '!'+lower_umin),
1254       (('bcsel', ('uge', 'b@{}'.format(s), a), b, a), ('umax', a, b), '!'+lower_umax),
1255       (('bcsel', ('ilt', 'b@{}'.format(s), a), b, a), ('imin', a, b), '!'+lower_imin),
1256       (('bcsel', ('ilt', 'a@{}'.format(s), b), b, a), ('imax', a, b), '!'+lower_imax),
1257       (('bcsel', ('ige', 'a@{}'.format(s), b), b, a), ('imin', a, b), '!'+lower_imin),
1258       (('bcsel', ('ige', 'b@{}'.format(s), a), b, a), ('imax', a, b), '!'+lower_imax),
1259
1260       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
1261       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),
1262
1263       # SM5 32-bit shifts are defined to use only the 5 least significant bits of the shift count (similarly 4 bits for 16-bit shifts, etc.), so masking the count with (bit-size - 1) is redundant.
1264       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
1265       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
1266       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
1267       (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), amount_bits - 1)), ('ushr', a, ('ishl', b, amount_bits - 1))),
1268       (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 3), amount_bits - 2)), ('ushr', a, ('ishl', b, amount_bits - 2))),
1269    ])
1270
1271optimizations.extend([
1272   # Common pattern like 'if (i == 0 || i == 1 || ...)'
1273   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
1274   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
1275   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
1276   (('ior', a, ('ieq', a, False)), True),
1277
1278   (('uge', a, 1), ('ine', a, 0)),
1279   (('ult', a, 1), ('ieq', a, 0)),
1280
1281   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
1282   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
1283
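   # The b2i32 below is 0 or 1, so the whole expression evaluates to either 0
   # or 1 << c.  Since b has exactly one bit set, (a & b) is either 0 or
   # 1 << find_lsb(b); the replacement gets the same result by moving that
   # single bit from position find_lsb(b) to position (c & 31), using a left or
   # right shift depending on which position is higher.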
1284   (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'),
1285    ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)),
1286              ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))),
1287              ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b)))
1288    )
1289   ),
1290
1291   (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)),
1292    ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'),
1293
1294   (('ior',  ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)),
1295   (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)),
1296
1297   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1298   # The first part of the iand comes from the !__feq64_nonnan.
1299   #
1300   # The second pattern is a reformulation of the first based on the relation
1301   # (x == 0 || y == 0) <=> umin(x, y) == 0, where b in the first pattern
1302   # happens to be the 'y == 0' term.
1303   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1304    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1305   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1306    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1307
1308   # These patterns can result when (a < b || a < c) => (a < min(b, c))
1309   # transformations occur before constant propagation and loop-unrolling.
1310   #
1311   # The flt versions are exact.  If isnan(a), the original pattern is
1312   # trivially false, and the replacements are false too.  If isnan(b):
1313   #
1314   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
1315   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1316   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1317   (('~fge', a, ('fmin', b, a)), True),
1318   (('~fge', ('fmax', a, b), a), True),
1319   (('flt', a, ('fmin', b, a)), False),
1320   (('flt', ('fmax', a, b), a), False),
1321   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1322   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1323
1324   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1325   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1326   (('ige', a, ('imin', b, a)), True),
1327   (('ige', ('imax', a, b), a), True),
1328   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1329   (('ult', ('umin', a, b), a), ('ult', b, a)),
1330   (('uge', a, ('umin', b, a)), True),
1331   (('uge', ('umax', a, b), a), True),
1332   (('ilt', a, ('imin', b, a)), False),
1333   (('ilt', ('imax', a, b), a), False),
1334   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1335   (('ige', ('imin', a, b), a), ('ige', b, a)),
1336   (('ult', a, ('umin', b, a)), False),
1337   (('ult', ('umax', a, b), a), False),
1338   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1339   (('uge', ('umin', a, b), a), ('uge', b, a)),
1340   (('ult', a, ('iand', b, a)), False),
1341   (('ult', ('ior', a, b), a), False),
1342   (('uge', a, ('iand', b, a)), True),
1343   (('uge', ('ior', a, b), a), True),
1344
1345   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1346   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1347   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1348   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1349   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1350   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1351   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1352   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1353   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1354   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1355   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1356   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1357   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1358   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1359   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1360   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1361
1362   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1363   # negative.
1364   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1365    ('iabs', ('ishr', a, b))),
1366   (('iabs', ('ishr', ('iabs', a), b)), ('ushr', ('iabs', a), b)),
1367   (('iabs', ('ushr', ('iabs', a), b)), ('ushr', ('iabs', a), b)),
1368
1369   (('fabs', ('slt', a, b)), ('slt', a, b)),
1370   (('fabs', ('sge', a, b)), ('sge', a, b)),
1371   (('fabs', ('seq', a, b)), ('seq', a, b)),
1372   (('fabs', ('sne', a, b)), ('sne', a, b)),
1373   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1374   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1375   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1376   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1377   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1378   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1379   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1380   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1381   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1382   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1383   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1384   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1385   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1386   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1387   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1388   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1389   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1390   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1391   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1392   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1393   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1394   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1395   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1396   (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'),
1397   (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'),
1398   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1399   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1400   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1401   (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1402   (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
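   # (sne yields 0.0 or 1.0 per component, so the self-dot-product above counts
   # the mismatching components and the fsat collapses any nonzero count to
   # 1.0.)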
1403])
1404
1405def vector_cmp(reduce_op, cmp_op, comps):
1406   if len(comps) == 1:
1407      return (cmp_op, 'a.' + comps[0], 'b.' + comps[0])
1408   else:
1409      mid = len(comps) // 2
1410      return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]),
1411                         vector_cmp(reduce_op, cmp_op, comps[mid:]))
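
# A quick illustration (not one of the rewrite tables): for a two-component
# comparison the helper above builds a small reduction tree.
assert vector_cmp('iand', 'ieq', 'xy') == \
   ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y'))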
1412
1413for op in [
1414   ('ball_iequal', 'ieq', 'iand'),
1415   ('ball_fequal', 'feq', 'iand'),
1416   ('bany_inequal', 'ine', 'ior'),
1417   ('bany_fnequal', 'fneu', 'ior'),
1418]:
1419   optimizations.extend([
1420      ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'),
1421      ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'),
1422      ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'),
1423      ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'),
1424      ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'),
1425   ])
1426
1427# D3D Boolean emulation
1428for s in [8, 16, 32, 64]:
1429   cond = 'true'
1430   if s == 64:
1431       cond = '!(options->lower_int64_options & nir_lower_conv64)'
1432
1433   optimizations.extend([
1434      (('bcsel@{}'.format(s), a, -1, 0), ('ineg', ('b2i', 'a@1')), cond),
1435      (('bcsel@{}'.format(s), a, 0, -1), ('ineg', ('b2i', ('inot', a))), cond),
1436      (('bcsel@{}'.format(s), a, 1, 0), ('b2i', 'a@1'), cond),
1437      (('bcsel@{}'.format(s), a, 0, 1), ('b2i', ('inot', a)), cond),
1438   ])
1439
1440optimizations.extend([
1441   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1442    ('ineg', ('b2i', ('iand', a, b)))),
1443   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1444    ('ineg', ('b2i', ('ior', a, b)))),
1445   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1446   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1447   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1448   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1449   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1450   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1451   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1452])
1453
1454optimizations.extend([
1455   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
1456   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
1457   (('feq', ('slt', a, b), 1.0), ('flt', a, b)),
1458   (('feq', ('sge', a, b), 1.0), ('fge', a, b)),
1459   (('fneu', ('seq', a, b), 0.0), ('feq', a, b)),
1460   (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),
1461   (('fneu', ('slt', a, b), 0.0), ('flt', a, b)),
1462   (('fneu', ('sge', a, b), 0.0), ('fge', a, b)),
1463   (('feq', ('seq', a, b), 0.0), ('fneu', a, b)),
1464   (('feq', ('sne', a, b), 0.0), ('feq', a, b)),
1465   (('feq', ('slt', a, b), 0.0), ('fge', a, b)),
1466   (('feq', ('sge', a, b), 0.0), ('flt', a, b)),
1467   (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),
1468   (('fneu', ('sne', a, b), 1.0), ('feq', a, b)),
1469   (('fneu', ('slt', a, b), 1.0), ('fge', a, b)),
1470   (('fneu', ('sge', a, b), 1.0), ('flt', a, b)),
1471
1472   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1473   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1474   # Emulating booleans
1475   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1476   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1477   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1478   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1479   (('ffma', ('b2f', 'a@1'), ('b2f', 'b@1'), c), ('fadd', ('b2f', ('iand', a, b)), c)),
1480   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1481   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1482   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1483   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
1484   # Comparison with the same args.  Note that these are only done for the
1485   # float versions when the source must be a number.  Generally, NaN cmp NaN
1486   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN
1487   # is false, and, for any number X, X < X is also false.
1488   (('ilt', a, a), False),
1489   (('ige', a, a), True),
1490   (('ieq', a, a), True),
1491   (('ine', a, a), False),
1492   (('ult', a, a), False),
1493   (('uge', a, a), True),
1494   (('flt', a, a), False),
1495   (('fge', 'a(is_a_number)', a), True),
1496   (('feq', 'a(is_a_number)', a), True),
1497   (('fneu', 'a(is_a_number)', a), False),
1498   # Logical and bit operations
1499   (('iand', a, a), a),
1500   (('iand', a, 0), 0),
1501   (('iand', a, -1), a),
1502   (('iand', a, ('inot', a)), 0),
1503   (('ior', a, a), a),
1504   (('ior', a, 0), a),
1505   (('ior', a, -1), -1),
1506   (('ior', a, ('inot', a)), -1),
1507   (('ixor', a, a), 0),
1508   (('ixor', a, 0), a),
1509   (('ixor', a, ('ixor', a, b)), b),
1510   (('ixor', a, -1), ('inot', a)),
1511   (('inot', ('inot', a)), a),
1512   (('ior', ('iand', a, b), b), b),
1513   (('ior', ('ior', a, b), b), ('ior', a, b)),
1514   (('iand', ('ior', a, b), b), b),
1515   (('iand', ('iand', a, b), b), ('iand', a, b)),
1516
1517   # It is common for sequences of (x & 1) to occur in large trees.  Replacing
1518   # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "&
1519   # 1" to eventually bubble up to the top of the tree.
1520   (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)),
1521    ('iand', a, ('iand', b, c))),
1522
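   # For 64-bit logic ops where one 32-bit half of the constant is all zeros or
   # all ones, only the other half can change, so the operation can be done as
   # a single 32-bit op on that half plus a pack with the known/unchanged half.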
1523   (('iand@64', a, '#b(is_lower_half_zero)'),
1524    ('pack_64_2x32_split', 0,
1525                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1526     '!options->lower_pack_64_2x32_split'),
1527   (('iand@64', a, '#b(is_upper_half_zero)'),
1528    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1529                           0),
1530     '!options->lower_pack_64_2x32_split'),
1531   (('iand@64', a, '#b(is_lower_half_negative_one)'),
1532    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1533                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1534     '!options->lower_pack_64_2x32_split'),
1535   (('iand@64', a, '#b(is_upper_half_negative_one)'),
1536    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1537                           ('unpack_64_2x32_split_y', a)),
1538     '!options->lower_pack_64_2x32_split'),
1539
1540   (('ior@64', a, '#b(is_lower_half_zero)'),
1541    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1542                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1543     '!options->lower_pack_64_2x32_split'),
1544   (('ior@64', a, '#b(is_upper_half_zero)'),
1545    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1546                           ('unpack_64_2x32_split_y', a)),
1547     '!options->lower_pack_64_2x32_split'),
1548   (('ior@64', a, '#b(is_lower_half_negative_one)'),
1549    ('pack_64_2x32_split', -1,
1550                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1551     '!options->lower_pack_64_2x32_split'),
1552   (('ior@64', a, '#b(is_upper_half_negative_one)'),
1553    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1554                           -1),
1555     '!options->lower_pack_64_2x32_split'),
1556
1557   (('ixor@64', a, '#b(is_lower_half_zero)'),
1558    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1559                           ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1560     '!options->lower_pack_64_2x32_split'),
1561   (('ixor@64', a, '#b(is_upper_half_zero)'),
1562    ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1563                           ('unpack_64_2x32_split_y', a)),
1564     '!options->lower_pack_64_2x32_split'),
1565
1566   # DeMorgan's Laws
1567   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1568   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1569   # Shift optimizations
1570   (('ishl', 0, a), 0),
1571   (('ishl', a, 0), a),
1572   (('ishr', 0, a), 0),
1573   (('ishr', -1, a), -1),
1574   (('ishr', a, 0), a),
1575   (('ushr', 0, a), 0),
1576   (('ushr', a, 0), a),
1577   (('bcsel', ('ieq', b, 0), a, ('ushr', a, b)), ('ushr', a, b)),
1578   (('bcsel', ('ieq', b, 0), a, ('ishr', a, b)), ('ishr', a, b)),
1579   (('bcsel', ('ieq', b, 0), a, ('ishl', a, b)), ('ishl', a, b)),
1580   (('bcsel', ('ine', b, 0), ('ushr', a, b), a), ('ushr', a, b)),
1581   (('bcsel', ('ine', b, 0), ('ishr', a, b), a), ('ishr', a, b)),
1582   (('bcsel', ('ine', b, 0), ('ishl', a, b), a), ('ishl', a, b)),
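   # Recognize open-coded rotates: for an N-bit value, (a << b) | (a >> (N - b))
   # is a rotate left by b, and the mirrored form is a rotate right.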
1583   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'),
1584   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'),
1585   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'),
1586   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'),
1587   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'),
1588   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'),
1589   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'),
1590   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'),
1591   (('urol@8',  a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub',  8, b))), '!options->has_rotate8'),
1592   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'),
1593   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'),
1594   (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))),
1595   (('uror@8',  a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub',  8, b))), '!options->has_rotate8'),
1596   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'),
1597   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'),
1598   (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))),
1599
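   # shfr(b, a, c) behaves like a funnel shift right: it yields the low 32 bits
   # of the 64-bit value (b:a) shifted right by c.  The patterns below match
   # common open-coded forms of that operation.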
1600   (('bitfield_select', 0xff000000, ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'),
1601   (('bitfield_select', 0xffff0000, ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'),
1602   (('bitfield_select', 0xffffff00, ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'),
1603   (('ior', ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'),
1604   (('ior', ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'),
1605   (('ior', ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'),
1606   (('bcsel', ('ieq', c, 0), a, ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c))), ('shfr', b, a, c), 'options->has_shfr32'),
1607   (('bcsel', ('ine', c, 0), ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c)), a), ('shfr', b, a, c), 'options->has_shfr32'),
1608   (('ior', ('ishl', 'a@32', ('iadd', 32, ('ineg', b))), ('ushr@32', a, b)), ('shfr', a, a, b), 'options->has_shfr32 && !options->has_rotate32'),
1609
1610   # bfi(X, a, b) = (b & ~X) | (a & X)
1611   # If X = ~0: (b & 0) | (a & 0xffffffff) = a
1612   # If X = 0:  (b & 0xffffffff) | (a & 0) = b
1613   (('bfi', 0xffffffff, a, b), a),
1614   (('bfi', 0x00000000, a, b), b),
1615
1616   # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the
1617   # bfi is either b or c.
1618   (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)),
1619
1620   # bfi(a, 0, 0) = ((0 << find_lsb(a)) & a) | (0 & ~a)
1621   #              = 0
1622   (('bfi', a, 0, 0), 0),
1623
1624   # bfi(a, b, b) = ((b << find_lsb(a)) & a) | (b & ~a)
1625   #              = (a & b) | (b & ~a)    If a is odd, find_lsb(a) == 0
1626   #              = b
1627   (('bfi', '#a(is_odd)', b, b), b),
1628
1629   # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a)
1630   #              = (a & a) | (b & ~a)    If a is odd, find_lsb(a) == 0
1631   #              = a | (b & ~a)
1632   #              = a | b
1633   (('bfi', '#a(is_odd)', a, b), ('ior', a, b)),
1634
1635   # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a)
1636   #              = ((b << find_lsb(a)) & a)
1637   #              = (b & a)               If a is odd, find_lsb(a) == 0
1638   (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)),
1639
1640   # Because 'a' is a positive power of two, the result of the bfi is either 0
1641   # or 'a' depending on whether or not 'b' is odd.  Use 'b&1' for the zero
1642   # value to help platforms that can't have two constants in a bcsel.
1643   (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1644    ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))),
1645   (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1646    ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))),
1647
1648   # Exponential/logarithmic identities
1649   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1650   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1651   # 32-bit fpow should use fmulz to fix https://gitlab.freedesktop.org/mesa/mesa/-/issues/11464 (includes apitrace)
1652   (('fpow@32', a, b), ('fexp2', ('fmulz', ('flog2', a), b)), 'options->lower_fpow && ' + has_fmulz), # a^b = 2^(lg2(a)*b)
1653   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1654   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1655   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
1656    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
1657   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1658   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1659   (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)),
1660   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1661   (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)),
1662   (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))),
1663   (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))),
1664   (('~fpow', a, 1.0), a),
1665   (('~fpow', a, 2.0), ('fmul', a, a)),
1666   (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)),
1667   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1668   (('~fpow', 2.0, a), ('fexp2', a)),
1669   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1670   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1671   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1672   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1673   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1674   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1675   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1676   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1677   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1678   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1679   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1680   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1681   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
1682   # Division and reciprocal
1683   (('~fdiv', 1.0, a), ('frcp', a)),
1684   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1685   (('~frcp', ('frcp', a)), a),
1686   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1687   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1688   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1689   # Trig
1690   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1691   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1692   # Boolean simplifications
1693   (('ieq', a, True), a),
1694   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1695   (('ine', a, False), a),
1696   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1697   (('bcsel', a, True, False), a),
1698   (('bcsel', a, False, True), ('inot', a)),
1699   (('bcsel', True, b, c), b),
1700   (('bcsel', False, b, c), c),
1701
1702   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1703   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1704   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1705   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1706   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1707   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1708   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1709   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1710   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1711   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1712   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1713   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1714
1715   (('bcsel', a, b, b), b),
1716   (('~fcsel', a, b, b), b),
1717
1718   # With D3D booleans, imax is AND and umax is OR
1719   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1720    ('ineg', ('b2i', ('iand', a, b)))),
1721   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1722    ('ineg', ('b2i', ('ior', a, b)))),
1723   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1724    ('ineg', ('b2i', ('ior', a, b)))),
1725   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1726    ('ineg', ('b2i', ('iand', a, b)))),
1727   (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior',  a, b))),
1728   (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1729
1730   # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op.
1731   (('iand', ('b2i', a), 1), ('b2i', a)),
1732
1733   (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))),
1734   (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))),
1735
1736   # Conversions
1737   (('f2i', ('ftrunc', a)), ('f2i', a)),
1738   (('f2u', ('ftrunc', a)), ('f2u', a)),
1739
1740   # Conversions from 16 bits to 32 bits and back can always be removed
1741   (('f2fmp', ('f2f32', 'a@16')), a),
1742   (('i2imp', ('i2i32', 'a@16')), a),
1743   (('i2imp', ('u2u32', 'a@16')), a),
1744
1745   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1746   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1747   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1748   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1749
1750   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1751   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1752   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1753
1754   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1755   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1756   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1757   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1758
1759   # Conversions to 16 bits would be lossy so they should only be removed if
1760   # the instruction was generated by the precision lowering pass.
1761   (('f2f32', ('f2fmp', 'a@32')), a),
1762   (('i2i32', ('i2imp', 'a@32')), a),
1763   (('u2u32', ('i2imp', 'a@32')), a),
1764
1765   # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32
1766   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1767   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1768   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1769   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1770
1771   # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32
1772   (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)),
1773   (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)),
1774   (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)),
1775
1776   (('ffloor', 'a(is_integral)'), a),
1777   (('fceil', 'a(is_integral)'), a),
1778   (('ftrunc', 'a(is_integral)'), a),
1779   (('fround_even', 'a(is_integral)'), a),
1780
1781   # fract(x) = x - floor(x), so fract(NaN) = NaN
1782   (('~ffract', 'a(is_integral)'), 0.0),
1783   (('fabs', 'a(is_not_negative)'), a),
1784   (('iabs', 'a(is_not_negative)'), a),
1785   (('fsat', 'a(is_not_positive)'), 0.0),
1786
1787   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1788
1789   # The result of the multiply must be in [-1, 0], so the result of the ffma
1790   # must be in [0, 1].
1791   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1792   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1793   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1794   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1795
1796   (('fneu', 'a(is_not_zero)', 0.0), True),
1797   (('feq', 'a(is_not_zero)', 0.0), False),
1798
1799   # In this chart, + means value > 0 and - means value < 0.
1800   #
1801   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1802   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1803   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1804   #
1805   # Using grouping conceptually similar to a Karnaugh map...
1806   #
1807   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1808   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1809   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1810   #
1811   # The flt / ilt cases just invert the expected result.
1812   #
1813   # The results expecting true must be marked imprecise or restricted to
1814   # number operands; the results expecting false are fine because NaN compared >= or < anything is false.
1815
1816   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1817   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1818   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1819
1820   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1821   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1822   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1823
1824   (('ine', 'a(is_not_zero)', 0), True),
1825   (('ieq', 'a(is_not_zero)', 0), False),
1826
1827   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1828   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1829   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1830
1831   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1832   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1833   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1834
1835   (('ult', 0, 'a(is_gt_zero)'), True),
1836   (('ult', a, 0), False),
1837])
1838
1839# Packing and then unpacking does nothing
1840for pack, bits, compbits in [('pack_64_2x32', 64, 32), ('pack_32_2x16', 32, 16)]:
1841    unpack = 'un' + pack
1842    optimizations += [
1843        ((unpack + '_split_x', (pack + '_split', a, b)), a),
1844        ((unpack + '_split_y', (pack + '_split', a, b)), b),
1845        ((unpack + '_split_x', (pack, a)), 'a.x'),
1846        ((unpack + '_split_y', (pack, a)), 'a.y'),
1847        ((unpack + '_split_x', ('u2u' + str(bits), 'a@' + str(compbits))), a),
1848        ((unpack + '_split_x', ('i2i' + str(bits), 'a@' + str(compbits))), a),
1849        ((unpack + '_split_y', ('i2i' + str(bits) + '(is_used_once)', 'a@' + str(compbits))), ('ishr', a, compbits - 1)),
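        # (The high half of a sign-extended value is just the sign bit
        # replicated, which is what the arithmetic shift by compbits - 1
        # computes.)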
1850        ((unpack, (pack + '_split', a, b)), ('vec2', a, b)),
1851        ((unpack, (pack, a)), a),
1852        ((pack + '_split', (unpack + '_split_x', a), (unpack + '_split_y', a)), a),
1853        ((pack + '_split', (unpack + '.x', a), (unpack + '.y', a)), a),
1854        ((pack, ('vec2', (unpack + '_split_x', a), (unpack + '_split_y', a))), a),
1855        ((pack, (unpack, a)), a),
1856    ]
1857
1858optimizations.extend([
1859   (('unpack_64_2x32_split_y', ('u2u64', 'a@1')), 0),
1860   (('unpack_64_2x32_split_y', ('u2u64', 'a@8')), 0),
1861   (('unpack_64_2x32_split_y', ('u2u64', 'a@16')), 0),
1862   (('unpack_64_2x32_split_y', ('u2u64', 'a@32')), 0), # Don't do that for u64 -> u64
1863   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1864   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1865
1866   (('unpack_64_4x16', ('pack_64_4x16', a)), a),
1867   (('pack_64_4x16', ('unpack_64_4x16', a)), a),
1868   (('unpack_32_4x8', ('pack_32_4x8', a)), a),
1869   (('pack_32_4x8', ('unpack_32_4x8', a)), a),
1870
1871   (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)),
1872   (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)),
1873
1874   (('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)),
1875    ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'),
1876   (('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))),
1877    ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'),
1878   (('pack_64_2x32', ('vec2', ('pack_32_2x16', ('vec2', a, b)), ('pack_32_2x16', ('vec2', c, d)))),
1879    ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'),
1880
1881   # Comparing two halves of an unpack separately.  While this optimization
1882   # should be correct for non-constant values, it's less obvious that it's
1883   # useful in that case.  For constant values, the pack will fold and we're
1884   # guaranteed to reduce the whole tree to one instruction.
1885   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1886             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1887    ('ieq', a, ('pack_32_2x16_split', b, c))),
1888
1889   # Byte extraction
1890   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1891   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1892   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1893   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1894   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1895   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1896   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1897   (('ishr', ('iand', 'a@32', 0x0000ff00),  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1898   (('ishr', ('iand', 'a@64', 0x0000ff00),  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1899   (('ishr', ('iand',  a,     0x00ff0000), 16), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1900
1901   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1902   # storage buffer.
1903   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1904   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1905
1906   # Common pattern after lowering 8-bit integers to 16-bit.
1907   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1908   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1909
1910   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1911   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1912   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1913   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1914   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1915   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1916   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1917   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1918
1919   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1920   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1921
1922   # The extract_X8(a & 0xff) patterns aren't included because the iand will
1923   # already be converted to extract_u8.
1924   (('extract_i8', ('iand', a, 0x0000ff00), 1), ('extract_i8', a, 1)),
1925   (('extract_i8', ('iand', a, 0x00ff0000), 2), ('extract_i8', a, 2)),
1926   (('extract_i8', ('iand', a, 0xff000000), 3), ('extract_i8', a, 3)),
1927
1928   (('extract_u8', ('iand', a, 0x0000ff00), 1), ('extract_u8', a, 1)),
1929   (('extract_u8', ('iand', a, 0x00ff0000), 2), ('extract_u8', a, 2)),
1930   (('extract_u8', ('iand', a, 0xff000000), 3), ('extract_u8', a, 3)),
1931
1932   (('iand', ('extract_u8',  a, 0), '#b'), ('iand', a, ('iand', b, 0x00ff))),
1933   (('iand', ('extract_u16', a, 0), '#b'), ('iand', a, ('iand', b, 0xffff))),
1934
1935   (('ieq', ('iand', ('extract_u8',  a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b),  8))), 0)),
1936   (('ine', ('iand', ('extract_u8',  a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b),  8))), 0)),
1937   (('ieq', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)),
1938   (('ine', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)),
1939
1940    # Word extraction
1941   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1942   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1943   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1944   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1945   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1946
1947   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1948   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1949   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1950   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1951
1952   # Packing a u8vec4 to write to an SSBO.
1953   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1954    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
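   # Note the reversed operand order: the unshifted byte 'd' is component 0 of
   # the vec4 (the low byte of the packed word), while 'a' lands in the top byte.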
1955
1956   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1957   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1958
1959   # The extract_X16(a & 0xff) patterns aren't included because the iand will
1960   # already be converted to extract_u8.
1961   (('extract_i16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), # extract_u8 is correct
1962   (('extract_u16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1963
1964   # Lower pack/unpack
1965   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1966   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'),
1967   (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'),
1968   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1969   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1970   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1971   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1972
1973   (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1974   (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1975
1976   # Useless masking before unpacking
1977   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1978   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1979   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1980   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1981   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1982   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1983
1984   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1985   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1986   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1987   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1988   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1989
1990   # Optimize half packing
1991   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1992   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1993
1994   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1995    ('pack_half_2x16', ('vec2', a, b))),
1996   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1997    ('pack_half_2x16', ('vec2', a, b))),
1998
1999   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
2000   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
2001   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
2002
2003   (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)),
2004   (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)),
2005   (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)),
2006
2007   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
2008   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
2009
2010   (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
2011   (('ior',  ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
2012
2013   (('pack_uint_2x16', ('vec2', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', b, 0))), ('pack_half_2x16_rtz_split', a, b)),
2014
2015   (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)),
2016    ('pack_half_2x16_split', c, a)),
2017
2018   # The important part here is that ~0xf & 0xfffffffc = ~0xf.
2019   (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc),
2020    ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)),
2021   (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc),
2022    ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)),
2023
2024   # 0x0f << 3 == 0x78, so that's already the maximum possible value.
2025   (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)),
2026
2027   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
2028   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
2029   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
2030   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
2031   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
2032   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
2033   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
2034   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
2035
2036   # Reduce intermediate precision with int64.
2037   (('u2u32', ('iadd(is_used_once)', 'a@64', b)),
2038    ('iadd', ('u2u32', a), ('u2u32', b))),
2039
2040   (('u2u32', ('imul(is_used_once)', 'a@64', b)),
2041    ('imul', ('u2u32', a), ('u2u32', b))),
2042
2043   (('u2f32', ('u2u64', 'a@32')), ('u2f32', a)),
2044
2045   # UINT32_MAX < a just checks the high half of a 64-bit value. This occurs
2046   # when lowering convert_uint_sat(ulong). Although the replacement is more
2047   # instructions, it replaces a 64-bit instruction with a 32-bit instruction
2048   # and a move that will likely be coalesced.
2049   (('ult', 0xffffffff, 'a@64'), ('ine', ('unpack_64_2x32_split_y', a), 0)),
2050
2051   # Redundant trip through 8-bit
2052   (('i2i16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)),
2053   (('u2u16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)),
2054
2055   # Reduce 16-bit integers to 1-bit booleans, hit with OpenCL. In turn, this
2056   # lets iand(b2i1(...), 1) get simplified. Backends can usually fuse iand/inot
2057   # so this should be no worse when it isn't strictly better.
2058   (('bcsel', a, 0, ('b2i16', 'b@1')), ('b2i16', ('iand', ('inot', a), b))),
2059   (('bcsel', a, ('b2i16', 'b@1'), ('b2i16', 'c@1')), ('b2i16', ('bcsel', a, b, c))),
2060
2061   # Lowered pack followed by lowered unpack, for the high bits
2062   (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@8')), 32)), ('u2u32', a)),
2063   (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@16')), 32)), ('u2u32', a)),
2064   (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@32')), 32)), ('u2u32', a)),
2065   (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@8')), 16)), ('u2u16', a)),
2066   (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@16')), 16)), ('u2u16', a)),
2067])
2068
2069# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
2070# patterns like those below.
2071for op in ('ushr', 'ishr'):
2072   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
2073   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
2074   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
2075
2076optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
2077
2078# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
2079# patterns like those below.
2080for op in ('extract_u8', 'extract_i8'):
2081   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
2082   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
2083   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
2084
2085for op, repl in [('ieq', 'ieq'), ('ine', 'ine'),
2086                 ('ult', 'ult'), ('ilt', 'ult'),
2087                 ('uge', 'uge'), ('ige', 'uge')]:
2088   optimizations.extend([
2089      ((op, ('pack_64_2x32_split', a, 0), ('pack_64_2x32_split', b, 0)), (repl, a, b)),
2090      ((op, ('pack_64_2x32_split', a, 0), '#b(is_upper_half_zero)'), (repl, a, ('unpack_64_2x32_split_x', b))),
2091      ((op, '#a(is_upper_half_zero)', ('pack_64_2x32_split', b, 0)), (repl, ('unpack_64_2x32_split_x', a), b)),
2092
2093      ((op, ('pack_64_2x32_split', 0, a), ('pack_64_2x32_split', 0, b)), (op, a, b)),
2094      ((op, ('pack_64_2x32_split', 0, a), '#b(is_lower_half_zero)'), (op, a, ('unpack_64_2x32_split_y', b))),
2095      ((op, '#a(is_lower_half_zero)', ('pack_64_2x32_split', 0, b)), (op, ('unpack_64_2x32_split_y', a), b)),
2096   ])
2097
2098optimizations.extend([
2099   # Subtracts
2100   (('ussub_4x8_vc4', a, 0), a),
2101   (('ussub_4x8_vc4', a, ~0), 0),
   # Lower all subtractions first - they can get recombined later
2103   (('fsub', a, b), ('fadd', a, ('fneg', b))),
2104   (('isub', a, b), ('iadd', a, ('ineg', b))),
2105   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
2106   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
2107   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
2108   (('bitz', a, b), ('inot', ('bitnz', a, b))),
2109
2110   # Propagate negation up multiplication chains
2111   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
2112   (('fmulz(is_used_by_non_fsat,nsz)', ('fneg', a), b), ('fneg', ('fmulz', a, b))),
2113   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
2114   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
2115   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
2116
2117   # Propagate constants up multiplication chains
2118   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
2119   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
2120   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
2121   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
2122   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
2123   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
2124   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
2125   # Prefer moving out a multiplication for more MAD/FMA-friendly code
2126   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
2127   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
2128   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
2129   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
2130   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
2131
2132   # Reassociate constants in add/mul chains so they can be folded together.
2133   # For now, we mostly only handle cases where the constants are separated by
2134   # a single non-constant.  We could do better eventually.
2135   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
2136   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
2137   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
2138   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
2139   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
2140   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
2141   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
2142   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
2143   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
2144   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
2145   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
2146   (('~fadd', '#a',          ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffmaz',          b,  c, ('fadd', a,          d))),
2147   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
2148   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
2149   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
2150   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
2151   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
2152   (('ior', ('iand', a, '#c'), ('ior', b, ('iand', a, '#d'))), ('ior', b, ('iand', a, ('ior', c, d)))),
2153
2154   # Reassociate add chains for more MAD/FMA-friendly code
2155   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
2156
2157   # Drop mul-div by the same value when there's no wrapping.
2158   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
2159
2160   # By definition...
2161   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
2162   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
2163   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
2164   (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2165   (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
2166
2167   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
2168   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
2169   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
2170   (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2171   (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
2172
2173   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
2174   (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2175
2176   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2177   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2178   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2179   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2180   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2181   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2182   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2183   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2184   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2185   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2186   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2187   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2188
2189   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'),
2190   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
2191   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
2192   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2193   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'),
2194   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
2195   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
2196   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2197   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
2198   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2199   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
2200   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2201
2202   # Clear the LSB
2203   (('iand', a, ('inot', ('ishl', 1, ('find_lsb', a)))), ('iand', a, ('inot', ('ineg', a)))),
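   # For example, a == 12 (0b1100): -a is ...110100, inot(-a) is ...001011,
   # and 12 & 0b1011 == 8, i.e. only the lowest set bit was cleared.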
2204
2205   # This is safe. Both ufind_msb_rev and bitfield_reverse can only have
2206   # 32-bit sources, so the transformation can only generate correct NIR.
2207   (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2208   (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'),
2209
2210   (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))),
2211   (('ifind_msb', ('extract_u8', a, b)),       ('ufind_msb', ('extract_u8', a, b))),
2212   (('ifind_msb', ('extract_u16', a, b)),      ('ufind_msb', ('extract_u16', a, b))),
2213   (('ifind_msb', ('imax', a, 1)),             ('ufind_msb', ('imax', a, 1))),
2214
2215   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
2216   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
2217   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
2218   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
2219   (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)),
2220   (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)),
2221   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
2222
2223   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
2224   (('bcsel', a, ('bcsel(is_used_once)', b, d, c), d), ('bcsel', ('iand', a, ('inot', b)), c, d)),
2225   (('bcsel', a, b, ('bcsel(is_used_once)', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
2226   (('bcsel', a, b, ('bcsel(is_used_once)', c, d, b)), ('bcsel', ('iand', c, ('inot', a)), d, b)),
2227
2228   # Misc. lowering
2229   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
2230   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
2231   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
2232   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
2233
2234   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2235    ('bcsel', ('ult', 31, 'bits'), 'insert',
2236              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
2237    'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'),
2238   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2239   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2240   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2241   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2242   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2243   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2244   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2245   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2246
2247   (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'),
2248   (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'),
2249
2250   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)),
2251    'options->lower_uadd_sat || (options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64)) != 0'),
2252   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
2253   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'),
2254   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),
2255
2256   # int64_t sum = a + b;
2257   #
   # if (a < 0 && b < 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b >= 0 && sum < a) {
   #    sum = INT64_MAX;
   # }
2263   #
2264   # A couple optimizations are applied.
2265   #
2266   # 1. a < sum => sum >= 0.  This replacement works because it is known that
2267   #    a < 0 and b < 0, so sum should also be < 0 unless there was
2268   #    underflow.
2269   #
2270   # 2. sum < a => sum < 0.  This replacement works because it is known that
2271   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
2272   #    overflow.
2273   #
2274   # 3. Invert the second if-condition and swap the order of parameters for
2275   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
2276   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
2277   #
2278   # On Intel Gen11, this saves ~11 instructions.
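   #
   # For example, with a = b = -2^62 the exact sum -2^63 still fits in
   # int64_t, so both 'a < sum' and 'sum >= 0' are false and nothing is
   # clamped; with a = b = -2^62 - 1 the sum wraps around to 2^63 - 2, both
   # conditions are true, and INT64_MIN is selected.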
2279   (('iadd_sat@64', a, b), ('bcsel',
2280                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
2281                            0x8000000000000000,
2282                            ('bcsel',
2283                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
2284                             ('iadd', a, b),
2285                             0x7fffffffffffffff)),
2286    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
2287
2288   # int64_t sum = a - b;
2289   #
   # if (a < 0 && b >= 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b < 0 && a >= sum) {
   #    sum = INT64_MAX;
   # }
2295   #
2296   # Optimizations similar to the iadd_sat case are applied here.
2297   (('isub_sat@64', a, b), ('bcsel',
2298                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
2299                            0x8000000000000000,
2300                            ('bcsel',
2301                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
2302                             ('isub', a, b),
2303                             0x7fffffffffffffff)),
2304    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
2305
2306   # These are done here instead of in the backend because the int64 lowering
2307   # pass will make a mess of the patterns.  The first patterns are
2308   # conditioned on nir_lower_minmax64 because it was not clear that it was
2309   # always an improvement on platforms that have real int64 support.  No
2310   # shaders in shader-db hit this, so it was hard to say one way or the
2311   # other.
2312   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2313   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2314   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2315   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2316   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2317   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2318
2319   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2320   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2321   # 0u < uint(a) <=> uint(a) != 0u
2322   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2323
2324   # Alternative lowering that doesn't rely on bfi.
2325   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2326    ('bcsel', ('ult', 31, 'bits'),
2327     'insert',
2328    (('ior',
2329     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
2330     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
2331    'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'),
2332
2333   # Alternative lowering that uses bitfield_select.
2334   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2335    ('bcsel', ('ult', 31, 'bits'), 'insert',
2336              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
2337    'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'),
2338
2339   (('ibitfield_extract', 'value', 'offset', 'bits'),
2340    ('bcsel', ('ult', 31, 'bits'), 'value',
2341              ('ibfe', 'value', 'offset', 'bits')),
2342    'options->lower_bitfield_extract && options->has_bfe'),
2343
2344   (('ubitfield_extract', 'value', 'offset', 'bits'),
2345    ('bcsel', ('ult', 31, 'bits'), 'value',
2346              ('ubfe', 'value', 'offset', 'bits')),
2347    'options->lower_bitfield_extract && options->has_bfe'),
2348
2349   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
2350   (('bitfield_select', a, b, 0), ('iand', a, b)),
2351   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
2352
2353   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
2354   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
2355   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
2356   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
2357   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
2358   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
2359   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
2360
2361   # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such
2362   (('ult', a, ('umin', ('iand', a, b), c)), False),
2363   (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False),
2364   (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2365    ('ubfe', 'value', 'offset', 'width')),
2366   (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2367    ('ibfe', 'value', 'offset', 'width')),
2368   (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'),
2369    ('bfm', 'width', 'offset')),
2370
2371   # open-coded BFM
2372   (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'),
2373   (('ishl', ('bfm', a, 0), b), ('bfm', a, b)),
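   # For example, a == 5: (1 << 5) - 1 == 0x1f, i.e. bfm(5, 0), and shifting
   # that mask left by b gives bfm(5, b) as in the second rule.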
2374
2375   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
2376   #
2377   #    If bits is zero, the result will be zero.
2378   #
2379   # These patterns prevent other patterns from generating invalid results
2380   # when count is zero.
2381   (('ubfe', a, b, 0), 0),
2382   (('ibfe', a, b, 0), 0),
2383
2384   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
2385
2386   (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)),
2387   (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
2388   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2389   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2390   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2391   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2392
2393   (('ibitfield_extract', 'value', 'offset', 'bits'),
2394    ('bcsel', ('ieq', 0, 'bits'),
2395     0,
2396     ('ishr',
2397       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
2398       ('isub', 32, 'bits'))),
2399    'options->lower_bitfield_extract && !options->has_bfe'),
2400
2401   (('ubitfield_extract', 'value', 'offset', 'bits'),
2402    ('iand',
2403     ('ushr', 'value', 'offset'),
2404     ('bcsel', ('ieq', 'bits', 32),
2405      0xffffffff,
2406      ('isub', ('ishl', 1, 'bits'), 1))),
2407    'options->lower_bitfield_extract && !options->has_bfe'),
2408
2409   (('ifind_msb', 'value'),
2410    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
2411    'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'),
2412
2413   (('ifind_msb', 'value'),
2414    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
2415     ('isub', 31, ('ifind_msb_rev', 'value')),
2416     ('ifind_msb_rev', 'value')),
2417    'options->lower_ifind_msb && options->has_find_msb_rev'),
2418
2419   # uclz of an absolute value source almost always does the right thing.
2420   # There are a couple problem values:
2421   #
2422   # * 0x80000000.  Since abs(0x80000000) == 0x80000000, uclz returns 0.
2423   #   However, findMSB(int(0x80000000)) == 30.
2424   #
2425   # * 0xffffffff.  Since abs(0xffffffff) == 1, uclz returns 31.  Section 8.8
2426   #   (Integer Functions) of the GLSL 4.50 spec says:
2427   #
2428   #    For a value of zero or negative one, -1 will be returned.
2429   #
   # * Negative powers of two.  uclz(abs(-(1<<x))) returns 31-x, so the
   #   lowered findMSB would be x, but findMSB(-(1<<x)) should return x-1.
2432   #
2433   # For all negative number cases, including 0x80000000 and 0xffffffff, the
   # correct value is obtained from uclz if, instead of negating the (already
   # negative) value, its bitwise-not is used.  A conditional bitwise-not can
   # be achieved by (x ^ (x >> 31)).
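   #
   # For example, for x == -8: x >> 31 == ~0 and -8 ^ ~0 == 7, so
   # 31 - uclz(7) == 31 - 29 == 2, which is findMSB(-8).  For x == -1 the
   # xor gives 0, uclz(0) == 32, and 31 - 32 == -1 as the spec requires.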
2437   (('ifind_msb', 'value'),
2438    ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))),
2439    'options->lower_ifind_msb && options->has_uclz'),
2440
2441   (('ufind_msb', 'value@32'),
2442    ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
2443     ('isub', 31, ('ufind_msb_rev', 'value')),
2444     ('ufind_msb_rev', 'value')),
2445    'options->lower_ufind_msb && options->has_find_msb_rev'),
2446
2447   (('ufind_msb', 'value@32'),
2448    ('isub', 31, ('uclz', 'value')),
2449    'options->lower_ufind_msb && options->has_uclz'),
2450
2451   (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'),
2452
2453   (('find_lsb', 'value@64'),
2454    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
2455    'options->lower_find_lsb'),
2456
2457   (('find_lsb', 'value'),
2458    ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))),
2459    'options->lower_find_lsb'),
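   # In the rules above, value & -value isolates the lowest set bit, e.g.
   # 12 & -12 == 4 and ufind_msb(4) == 2 == find_lsb(12); for zero both
   # sides yield -1.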
2460
2461   (('extract_i8', a, 'b@32'),
2462    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
2463    'options->lower_extract_byte'),
2464
2465   (('extract_u8', a, 'b@32'),
2466    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
2467    'options->lower_extract_byte'),
2468
2469   (('extract_i16', a, 'b@32'),
2470    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
2471    'options->lower_extract_word'),
2472
2473   (('extract_u16', a, 'b@32'),
2474    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
2475    'options->lower_extract_word'),
2476
2477    (('pack_unorm_2x16', 'v'),
2478     ('pack_uvec2_to_uint',
2479        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
2480     'options->lower_pack_unorm_2x16'),
2481
2482    (('pack_unorm_4x8', 'v'),
2483     ('pack_uvec4_to_uint',
2484        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
2485     'options->lower_pack_unorm_4x8 && !options->has_pack_32_4x8'),
2486
2487    (('pack_unorm_4x8', 'v'),
2488     ('pack_32_4x8',
2489        ('f2u8', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
2490     'options->lower_pack_unorm_4x8 && options->has_pack_32_4x8'),
2491
2492    (('pack_snorm_2x16', 'v'),
2493     ('pack_uvec2_to_uint',
2494        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
2495     'options->lower_pack_snorm_2x16'),
2496
2497    (('pack_snorm_4x8', 'v'),
2498     ('pack_uvec4_to_uint',
2499        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
2500     'options->lower_pack_snorm_4x8 && !options->has_pack_32_4x8'),
2501
2502    (('pack_snorm_4x8', 'v'),
2503     ('pack_32_4x8',
2504        ('f2i8', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
2505     'options->lower_pack_snorm_4x8 && options->has_pack_32_4x8'),
2506
2507    (('unpack_unorm_2x16', 'v'),
2508     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
2509                                  ('extract_u16', 'v', 1))),
2510              65535.0),
2511     'options->lower_unpack_unorm_2x16'),
2512
2513    (('unpack_unorm_4x8', 'v'),
2514     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
2515                                  ('extract_u8', 'v', 1),
2516                                  ('extract_u8', 'v', 2),
2517                                  ('extract_u8', 'v', 3))),
2518              255.0),
2519     'options->lower_unpack_unorm_4x8'),
2520
2521    (('unpack_snorm_2x16', 'v'),
2522     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
2523                                                            ('extract_i16', 'v', 1))),
2524                                           32767.0))),
2525     'options->lower_unpack_snorm_2x16'),
2526
2527    (('unpack_snorm_4x8', 'v'),
2528     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
2529                                                            ('extract_i8', 'v', 1),
2530                                                            ('extract_i8', 'v', 2),
2531                                                            ('extract_i8', 'v', 3))),
2532                                           127.0))),
2533     'options->lower_unpack_snorm_4x8'),
2534
2535   (('pack_half_2x16_split', 'a@32', 'b@32'),
2536    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
2537    'options->lower_pack_split'),
2538
2539   (('unpack_half_2x16_split_x', 'a@32'),
2540    ('f2f32', ('u2u16', a)),
2541    'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2542
2543   (('unpack_half_2x16_split_x', 'a@32'),
2544    ('f2f32', ('fmul', 1.0, ('u2u16', a))),
2545    'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2546
2547   (('unpack_half_2x16_split_y', 'a@32'),
2548    ('f2f32', ('u2u16', ('ushr', a, 16))),
2549    'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2550
2551   (('unpack_half_2x16_split_y', 'a@32'),
2552    ('f2f32', ('fmul', 1.0, ('u2u16', ('ushr', a, 16)))),
2553    'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2554
2555   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
2556   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
2557   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
2558   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
2559   # Mark the new comparisons precise to prevent them being changed to 'a !=
2560   # 0' or 'a == 0'.
2561   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
2562   (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'),
2563
2564   # Address/offset calculations:
2565   # Drivers supporting imul24 should use a pass like nir_lower_amul(), this
2566   # rule converts everyone else to imul:
2567   (('amul', a, b), ('imul', a, b), '!options->has_imul24 && !options->has_amul'),
2568
2569   # udiv_aligned_4 assumes the source is a multiple of 4 specifically to enable
2570   # this identity. Usually this transform would require masking.
2571   (('amul', ('udiv_aligned_4', a), 4), a),
2572   (('imul', ('udiv_aligned_4', a), 4), a),
2573
2574   (('umul24', a, b),
2575    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
2576    '!options->has_umul24'),
2577   (('umad24', a, b, c),
2578    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
2579    '!options->has_umad24'),
2580
2581   # Relaxed 24bit ops
2582   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
2583   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
2584   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
2585   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
2586   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
2587   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
2588
2589   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
2590   (('imad24_ir3', a, 0, c), (c)),
2591   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
2592
2593   # if first two srcs are const, crack apart the imad so constant folding
2594   # can clean up the imul:
2595   # TODO ffma should probably get a similar rule:
2596   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
2597
2598   # These will turn 24b address/offset calc back into 32b shifts, but
2599   # it should be safe to get back some of the bits of precision that we
   # already decided were not necessary:
2601   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
2602   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
2603   (('imul24', a, 0), (0)),
2604])
2605
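# Recognize open-coded saturating arithmetic: an unsigned add wrapped around
# exactly when the result is smaller than one of its operands, so e.g.
# bcsel(ult(iadd(a, b), a), -1, iadd(a, b)) is uadd_sat(a, b).  The loops
# below match several spellings of this and the analogous usub_sat forms.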
2606for bit_size in [8, 16, 32, 64]:
2607   cond = '!options->lower_uadd_sat'
2608   if bit_size == 64:
2609      cond += ' && !(options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64))'
2610   add = 'iadd@' + str(bit_size)
2611
2612   optimizations += [
2613      (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2614      (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond),
2615      (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond),
2616      (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2617   ]
2618
2619for bit_size in [8, 16, 32, 64]:
2620   cond = '!options->lower_usub_sat'
2621   if bit_size == 64:
2622      cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)'
2623   add = 'iadd@' + str(bit_size)
2624
2625   optimizations += [
2626      (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2627      (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2628      (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2629      (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2630   ]
2631
2632# bit_size dependent lowerings
2633for bit_size in [8, 16, 32, 64]:
2634   # convenience constants
2635   intmax = (1 << (bit_size - 1)) - 1
2636   intmin = 1 << (bit_size - 1)
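   # Note that intmin is written as the unsigned bit pattern 1 << (bit_size - 1),
   # the same encoding used for the 0x8000000000000000 INT64_MIN constant in
   # the iadd_sat@64 lowering above.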
2637
2638   optimizations += [
2639      (('iadd_sat@' + str(bit_size), a, b),
2640       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2641                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2642      (('isub_sat@' + str(bit_size), a, b),
2643       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2644                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2645   ]
2646
2647invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2648
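# This applies De Morgan's law to inverted float comparisons, e.g.
# inot(ior(feq(a, b), feq(c, d))) becomes iand(fneu(a, b), fneu(c, d)).
# It holds even for NaN operands because inot(feq) and fneu agree: feq is
# an ordered comparison and fneu is unordered.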
2649for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2650   optimizations.append((('inot', ('ior(is_used_once)', (left + '(is_used_once)', a, b),
2651                                                        (right + '(is_used_once)', c, d))),
2652                         ('iand', (invert[left], a, b), (invert[right], c, d))))
2653   optimizations.append((('inot', ('iand(is_used_once)', (left + '(is_used_once)', a, b),
2654                                                         (right + '(is_used_once)', c, d))),
2655                         ('ior', (invert[left], a, b), (invert[right], c, d))))
2656
2657# Optimize x2yN(b2x(x)) -> b2y
2658for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2659   if x != 'f' and y != 'f' and x != y:
2660      continue
2661
2662   b2x = 'b2f' if x == 'f' else 'b2i'
2663   b2y = 'b2f' if y == 'f' else 'b2i'
2664   x2yN = '{}2{}'.format(x, y)
2665   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2666
2667# Optimize away x2xN(a@N)
2668for t in ['int', 'uint', 'float', 'bool']:
2669   for N in type_sizes(t):
2670      x2xN = '{0}2{0}{1}'.format(t[0], N)
2671      aN = 'a@{0}'.format(N)
2672      optimizations.append(((x2xN, aN), a))
2673
2674# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2675# In particular, we can optimize away everything except upcast of downcast and
2676# upcasts where the type differs from the other cast
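# For example, u2u16(i2i32(a@8)) is replaced by i2i16(a): sign-extending to
# 32 bits and then truncating to 16 bits is the same as sign-extending
# directly to 16 bits.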
2677for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2678   if N < M:
2679      # The outer cast is a down-cast.  It doesn't matter what the size of the
      # argument of the inner cast is because we'll never be in the upcast
2681      # of downcast case.  Regardless of types, we'll always end up with y2yN
2682      # in the end.
2683      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
2684         x2xN = '{0}2{0}{1}'.format(x, N)
2685         y2yM = '{0}2{0}{1}'.format(y, M)
2686         y2yN = '{0}2{0}{1}'.format(y, N)
2687         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
2688   elif N > M:
2689      # If the outer cast is an up-cast, we have to be more careful about the
2690      # size of the argument of the inner cast and with types.  In this case,
      # the type is always the type of the up-cast, which is given by the
2692      # outer cast.
2693      for P in type_sizes('uint'):
2694         # We can't optimize away up-cast of down-cast.
2695         if M < P:
2696            continue
2697
         # Because we're doing an up-cast of an up-cast, the types always have
         # to match between the two casts
2700         for x in ['i', 'u']:
2701            x2xN = '{0}2{0}{1}'.format(x, N)
2702            x2xM = '{0}2{0}{1}'.format(x, M)
2703            aP = 'a@{0}'.format(P)
2704            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
2705   else:
2706      # The N == M case is handled by other optimizations
2707      pass
2708
2709# Downcast operations should be able to see through pack
2710for t in ['i', 'u']:
2711    for N in [8, 16, 32]:
2712        x2xN = '{0}2{0}{1}'.format(t, N)
2713        optimizations += [
            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
2716        ]
2717
2718# Optimize comparisons with up-casts
2719for t in ['int', 'uint', 'float']:
2720    for N, M in itertools.product(type_sizes(t), repeat=2):
2721        if N == 1 or N >= M:
2722            continue
2723
2724        cond = 'true'
2725        if N == 8:
2726            cond = 'options->support_8bit_alu'
2727        elif N == 16:
2728            cond = 'options->support_16bit_alu'
2729        x2xM = '{0}2{0}{1}'.format(t[0], M)
2730        x2xN = '{0}2{0}{1}'.format(t[0], N)
2731        aN = 'a@' + str(N)
2732        bN = 'b@' + str(N)
2733        xeq = 'feq' if t == 'float' else 'ieq'
2734        xne = 'fneu' if t == 'float' else 'ine'
2735        xge = '{0}ge'.format(t[0])
2736        xlt = '{0}lt'.format(t[0])
2737
2738        # Up-casts are lossless so for correctly signed comparisons of
2739        # up-casted values we can do the comparison at the largest of the two
2740        # original sizes and drop one or both of the casts.  (We have
2741        # optimizations to drop the no-op casts which this may generate.)
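        # For example, feq(f2f32(a@16), f2f32(b@16)) becomes feq(a, f2f16(b)),
        # and the now redundant f2f16 of an already 16-bit value is removed by
        # the x2xN(a@N) rule above.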
2742        for P in type_sizes(t):
2743            if P == 1 or P > N:
2744                continue
2745
2746            bP = 'b@' + str(P)
2747            optimizations += [
2748                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
2749                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
2750                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
2751                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
2752                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
2753                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
2754            ]
2755
2756        # The next bit doesn't work on floats because the range checks would
2757        # get way too complicated.
2758        if t in ['int', 'uint']:
2759            if t == 'int':
2760                xN_min = -(1 << (N - 1))
2761                xN_max = (1 << (N - 1)) - 1
2762            elif t == 'uint':
2763                xN_min = 0
2764                xN_max = (1 << N) - 1
2765            else:
2766                assert False
2767
2768            # If we're up-casting and comparing to a constant, we can unfold
2769            # the comparison into a comparison with the shrunk down constant
2770            # and a check that the constant fits in the smaller bit size.
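            # For example, ieq(u2u32(a@8), 300) unfolds to
            # iand(ieq(a, u2u8(300)), ieq(u2u32(u2u8(300)), 300)); the second
            # operand constant-folds to false because 300 does not fit in
            # 8 bits, so the whole comparison folds to false.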
2771            optimizations += [
2772                ((xeq, (x2xM, aN), '#b'),
2773                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2774                ((xne, (x2xM, aN), '#b'),
2775                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2776                ((xlt, (x2xM, aN), '#b'),
2777                 ('iand', (xlt, xN_min, b),
2778                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2779                ((xlt, '#a', (x2xM, bN)),
2780                 ('iand', (xlt, a, xN_max),
2781                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2782                ((xge, (x2xM, aN), '#b'),
2783                 ('iand', (xge, xN_max, b),
2784                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2785                ((xge, '#a', (x2xM, bN)),
2786                 ('iand', (xge, a, xN_min),
2787                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2788            ]
2789
2790# Convert masking followed by signed downcast to just unsigned downcast
2791optimizations += [
2792    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2793    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2794    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2795    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2796    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2797    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2798]
2799
2800# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs, so we can
2802# remove casts
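#
# For example, if only the low 8 bits of an iadd result are consumed,
# iadd(i2i32(i2i8(a@32)), b) produces the same low 8 bits as iadd(a, b),
# because carries only propagate from low bits towards high bits.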
2803for N in [16, 32]:
2804    for M in [8, 16]:
2805        if M >= N:
2806            continue
2807
2808        aN = 'a@' + str(N)
2809        u2uM = 'u2u{0}'.format(M)
2810        i2iM = 'i2i{0}'.format(M)
2811
2812        for x in ['u', 'i']:
2813            x2xN = '{0}2{0}{1}'.format(x, N)
2814            extract_xM = 'extract_{0}{1}'.format(x, M)
2815
2816            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2817            extract_xM_M_bits = \
2818                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2819            optimizations += [
2820                ((x2xN_M_bits, (u2uM, aN)), a),
2821                ((extract_xM_M_bits, aN, 0), a),
2822            ]
2823
2824            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2825            optimizations += [
2826                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2827                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2828                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2829            ]
2830
2831            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2832                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2833                optimizations += [
2834                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2835                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2836                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2837                ]
2838
2839def fexp2i(exp, bits):
2840   # Generate an expression which constructs value 2.0^exp or 0.0.
2841   #
2842   # We assume that exp is already in a valid range:
2843   #
2844   #   * [-15, 15] for 16-bit float
2845   #   * [-127, 127] for 32-bit float
   #   * [-1023, 1023] for 64-bit float
2847   #
2848   # If exp is the lowest value in the valid range, a value of 0.0 is
2849   # constructed.  Otherwise, the value 2.0^exp is constructed.
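   #
   # For example, fexp2i(3, 32) returns ('ishl', ('iadd', 3, 127), 23); once
   # constant folded that is 130 << 23 == 0x41000000, the bit pattern of the
   # 32-bit float 8.0 == 2.0^3.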
2850   if bits == 16:
2851      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2852   elif bits == 32:
2853      return ('ishl', ('iadd', exp, 127), 23)
2854   elif bits == 64:
2855      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2856   else:
2857      assert False
2858
2859def ldexp(f, exp, bits):
2860   # The maximum possible range for a normal exponent is [-126, 127] and,
2861   # throwing in denormals, you get a maximum range of [-149, 127].  This
2862   # means that we can potentially have a swing of +-276.  If you start with
2863   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
2864   # all the way to zero.  The GLSL spec only requires that we handle a subset
2865   # of this range.  From version 4.60 of the spec:
2866   #
2867   #    "If exp is greater than +128 (single-precision) or +1024
2868   #    (double-precision), the value returned is undefined. If exp is less
2869   #    than -126 (single-precision) or -1022 (double-precision), the value
2870   #    returned may be flushed to zero. Additionally, splitting the value
2871   #    into a significand and exponent using frexp() and then reconstructing
2872   #    a floating-point value using ldexp() should yield the original input
2873   #    for zero and all finite non-denormalized values."
2874   #
2875   # The SPIR-V spec has similar language.
2876   #
2877   # In order to handle the maximum value +128 using the fexp2i() helper
2878   # above, we have to split the exponent in half and do two multiply
2879   # operations.
2880   #
2881   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
2882   # twice the full range that is valid for the fexp2i() function above.  If
2883   # exp/2 is the bottom value of that range, the fexp2i() expression will
2884   # yield 0.0f which, when multiplied by f, will flush it to zero which is
2885   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
2886   # value is clamped from above, then it must have been above the supported
2887   # range of the GLSL built-in and therefore any return value is acceptable.
2888   if bits == 16:
2889      exp = ('imin', ('imax', exp, -30), 30)
2890   elif bits == 32:
2891      exp = ('imin', ('imax', exp, -254), 254)
2892   elif bits == 64:
2893      exp = ('imin', ('imax', exp, -2046), 2046)
2894   else:
2895      assert False
2896
2897   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
2898   # (We use ishr which isn't the same for -1, but the -1 case still works
2899   # since we use exp-exp/2 as the second exponent.)  While the spec
2900   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
2901   # work with denormals and doesn't allow for the full swing in exponents
2902   # that you can get with normalized values.  Instead, we create two powers
2903   # of two and multiply by them each in turn.  That way the effective range
2904   # of our exponent is doubled.
2905   pow2_1 = fexp2i(('ishr', exp, 1), bits)
2906   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
2907   return ('fmul', ('fmul', f, pow2_1), pow2_2)
2908
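# Worked example (editor's note): with exp = 254 (the top of the 32-bit clamp
# range), exp/2 = 127 and exp - exp/2 = 127, so the result is
# f * 2.0^127 * 2.0^127 = f * 2.0^254.  With exp = -1, ishr gives -1 and
# exp - exp/2 gives 0, so the result is f * 2.0^-1 * 2.0^0; this is why the
# rounding direction of ishr does not matter.
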
2909optimizations += [
2910   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
2911   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
2912   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
2913]
2914
2915# XCOM 2 (OpenGL) open-codes bitfieldReverse()
2916def bitfield_reverse_xcom2(u):
2917    step1 = ('iadd', ('ishl', u, 16), ('ushr', u, 16))
2918    step2 = ('iadd', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2919    step3 = ('iadd', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2920    step4 = ('iadd', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2921    step5 = ('iadd(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2922
2923    return step5
2924
2925# Unreal Engine 4 demo applications open-code bitfieldReverse()
2926def bitfield_reverse_ue4(u):
2927    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2928    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
2929    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
2930    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
2931    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
2932
2933    return step5
2934
2935# Cyberpunk 2077 open-codes bitfieldReverse()
2936def bitfield_reverse_cp2077(u):
2937    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2938    step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2939    step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2940    step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2941    step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2942
2943    return step5
2944
2945optimizations += [(bitfield_reverse_xcom2('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2946optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2947optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2948
2949# VKD3D-Proton implements the DXBC f32-to-f16 conversion using PackHalf2x16.
2950# Because the spec does not specify a rounding mode or behaviour regarding infinity,
2951# it emits a sequence to ensure D3D-like behaviour for infinity.
2952# When we know the current backend already behaves as required, we can eliminate the extra sequence.
2953#
2954# Input is f32, output is u32 that has the f16 packed into its low bits.
2955def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a):
2956    packed_half = ('pack_half_2x16_rtz_split', a, 0)
2957    packed_half_minus1 = ('iadd', packed_half, 0xffffffff)
2958    f32_was_not_inf = ('ine', abs_a, 0x7f800000)
2959    f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00)
2960    return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half)
2961
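# Editor's note on the sequence above: subtracting 1 from a half-precision Inf
# bit pattern (0x7c00, or 0xfc00 with the sign bit set) gives the
# largest-magnitude finite half (0x7bff / 0xfbff), which is what a
# round-toward-zero pack already produces for overflowing finite inputs, so the
# bcsel guard never fires and the whole sequence collapses to the plain pack.
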
2962optimizations += [
2963   (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)),
2964   (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)),
2965   (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)),
2966]
2967
2968def vkd3d_proton_msad():
2969   pattern = None
2970   for i in range(4):
2971      ref = ('extract_u8', 'a@32', i)
2972      src = ('extract_u8', 'b@32', i)
2973      sad = ('iabs', ('iadd', ref, ('ineg', src)))
2974      msad = ('bcsel', ('ieq', ref, 0), 0, sad)
2975      if pattern is None:
2976         pattern = msad
2977      else:
2978         pattern = ('iadd', pattern, msad)
2979   pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:])
2980   return pattern
2981
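# The helper above open-codes
#    sum over i in 0..3 of (a.byte[i] == 0 ? 0 : |a.byte[i] - b.byte[i]|)
# which is msad_4x8(a, b, 0); the second rule below then folds an outer iadd
# into the accumulator operand.
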
2982optimizations += [
2983   (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'),
2984   (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)),
2985]
2986
2987
2988# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
2989# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
2990for ncomp in [2, 3, 4, 8, 16]:
2991   optimizations += [
2992      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
2993      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
2994      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
2995      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
2996   ]
2997
2998# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
2999# then the "a == a" is redundant because it's equivalent to "a is not NaN"
3000# and, if a is a NaN, the second comparison will fail anyway.
3001for op in ['flt', 'fge', 'feq']:
3002   optimizations += [
3003      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
3004      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
3005   ]
3006
3007# Add optimizations to handle the case where the result of a ternary is
3008# compared to a constant.  This way we can take things like
3009#
3010# (a ? 0 : 1) > 0
3011#
3012# and turn it into
3013#
3014# a ? (0 > 0) : (1 > 0)
3015#
3016# which constant folding will eat for lunch.  The resulting ternary will
3017# further get cleaned up by the boolean reductions above and we will be
3018# left with just the original variable "a".
3019for op in ['feq', 'fneu', 'ieq', 'ine']:
3020   optimizations += [
3021      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
3022       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
3023   ]
3024
3025for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
3026   optimizations += [
3027      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
3028       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
3029      ((op, '#d', ('bcsel', a, '#b', '#c')),
3030       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
3031   ]
3032
3033
3034# For example, this converts things like
3035#
3036#    1 + mix(0, a - 1, condition)
3037#
3038# into
3039#
3040#    mix(1, (a-1)+1, condition)
3041#
3042# Other optimizations will rearrange the constants.
3043for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
3044   optimizations += [
3045      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
3046   ]
3047
3048# Some optimizations for ir3-specific instructions.
3049optimizations += [
3050   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
3051   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
3052   # '(al * bh) << 16 + c': If either 'al' or 'bh' is zero, return 'c'.
3053   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
3054   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
3055]
3056
3057# These kinds of sequences can occur after nir_opt_peephole_select.
3058#
3059# NOTE: fadd is not handled here because that gets in the way of ffma
3060# generation in the i965 driver.  Instead, fadd and ffma are handled in
3061# late_optimizations.
3062
3063for op in ['flrp']:
3064    optimizations += [
3065        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3066        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3067        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3068        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3069        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
3070        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
3071    ]
3072
3073for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
3074    optimizations += [
3075        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
3076        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
3077        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
3078        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
3079    ]
3080
3081for op in ['fpow']:
3082    optimizations += [
3083        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
3084        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
3085        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
3086        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
3087    ]
3088
3089for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']:
3090    optimizations += [
3091        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
3092    ]
3093
3094for op in ['ineg', 'iabs', 'inot', 'isign']:
3095    optimizations += [
3096        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
3097    ]
3098
3099optimizations.extend([
3100    (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'),
3101    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'),
3102    (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
3103    ])
3104
3105
3106"""
3107  if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16)
3108     return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */;
3109  else
3110     return f2f32(f2f16(val));
3111"""
3112optimizations.extend([
3113    (('fquantize2f16', 'a@32'),
3114     ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
3115               ('iand', a, 1 << 31),
3116               ('!f2f32', ('!f2f16_rtne', a))),
3117     'options->lower_fquantize2f16')
3118    ])
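# Editor's note: math.ldexp(1.0, -14) is 2^-14, the smallest normal fp16 value
# (SMALLEST_NORMALIZED_FLOAT16 in the pseudocode above), and 1 << 31 keeps only
# the sign bit of the 32-bit input.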
3119
3120for s in range(0, 31):
3121    mask = 0xffffffff << s
3122
3123    # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior
3124    # will never both have the same bits set, replacing the ior with an iadd
3125    # is safe (i.e., a carry out of a bit can never be generated). The iadd is
3126    # more likely to participate in other optimization patterns (e.g., iadd of
3127    # constant reassociation)
3128    optimizations.extend([
3129        (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)),
3130         'options->avoid_ternary_with_two_constants'),
3131    ])
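    # Worked example (editor's note): for s = 8 the mask is 0xffffff00, so
    # bfi(0xffffff00, a, b) is (a << 8) | (b & 0xff); the two operands occupy
    # disjoint bits, which is why the ior can be rewritten as the iadd above.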
3132
3133# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN.
3134# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here)
3135for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']:
3136    optimizations += [((op, '#a(is_nan)', b), NAN)]
3137    optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative
3138
3139# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN.
3140for op in ['ffma', 'flrp']:
3141    optimizations += [((op, '#a(is_nan)', b, c), NAN)]
3142    optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative
3143    optimizations += [((op, a, b, '#c(is_nan)'), NAN)]
3144
3145# NaN propagation: FP min/max. Pick the non-NaN operand.
3146for op in ['fmin', 'fmax']:
3147    optimizations += [((op, '#a(is_nan)', b), b)] # commutative
3148
3149# NaN propagation: ldexp is NaN if the first operand is NaN.
3150optimizations += [(('ldexp', '#a(is_nan)', b), NAN)]
3151
3152# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN.
3153for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']:
3154    optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative
3155
3156# NaN propagation: FP comparison opcodes except !=. Replace it with false.
3157for op in ['feq', 'fge', 'flt']:
3158    optimizations += [((op, '#a(is_nan)', b), False)]
3159    optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative
3160
3161# NaN propagation: FP comparison opcodes using !=. Replace it with true.
3162# Operator != is the only opcode where a comparison with NaN returns true.
3163for op in ['fneu']:
3164    optimizations += [((op, '#a(is_nan)', b), True)] # commutative
3165
3166# NaN propagation: FP comparison opcodes except != returning FP 0 or 1.
3167for op in ['seq', 'sge', 'slt']:
3168    optimizations += [((op, '#a(is_nan)', b), 0.0)]
3169    optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative
3170
3171# NaN propagation: FP comparison opcodes using != returning FP 0 or 1.
3172# Operator != is the only opcode where a comparison with NaN returns true.
3173optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative
3174
3175# This section contains optimizations to propagate downsizing conversions of
3176# constructed vectors into vectors of downsized components. Whether this is
3177# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
3178# this reduces the register pressure of the vector itself and often enables the
3179# conversions to be eliminated via other algebraic rules or constant folding.
3180# In the worst case on a SIMD architecture, the propagated conversions may be
3181# revectorized via nir_opt_vectorize so instruction count is minimally
3182# impacted.
3183#
3184# On a machine with SIMD-within-a-register only, this actually
3185# counterintuitively hurts instruction count. These machines are the same ones that
3186# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
3187# not being set.
3188#
3189# Finally, for scalar architectures there should be no difference in generated
3190# code, since it all ends up scalarized at the end, but it might minimally help
3191# compile times.
3192
3193for i in range(2, 4 + 1):
3194   for T in ('f', 'u', 'i'):
3195      vec_inst = ('vec' + str(i),)
3196
3197      indices = ['a', 'b', 'c', 'd']
3198      suffix_in = tuple((indices[j] + '@32') for j in range(i))
3199
3200      to_16 = '{}2{}16'.format(T, T)
3201      to_mp = '{}2{}mp'.format(T, T)
3202
3203      out_16 = tuple((to_16, indices[j]) for j in range(i))
3204      out_mp = tuple((to_mp, indices[j]) for j in range(i))
3205
3206      optimizations  += [
3207         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
3208      ]
3209      # u2ump doesn't exist, because it's equal to i2imp
3210      if T in ['f', 'i']:
3211          optimizations  += [
3212             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
3213          ]
3214
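# For example (editor's note), the loops above generate rules such as
#    f2f16(vec2(a@32, b@32)) -> vec2(f2f16(a), f2f16(b))
# guarded by !options->vectorize_vec2_16bit.
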
3215# This section contains "late" optimizations that should be run before
3216# creating ffmas and calling regular optimizations for the final time.
3217# Optimizations should go here if they help code generation and conflict
3218# with the regular optimizations.
3219before_ffma_optimizations = [
3220   # Propagate constants down multiplication chains
3221   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
3222   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
3223   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
3224   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
3225
3226   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
3227   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
3228   (('~fadd', ('fneg', a), a), 0.0),
3229   (('iadd', ('ineg', a), a), 0),
3230   (('iadd', ('ineg', a), ('iadd', a, b)), b),
3231   (('iadd', a, ('iadd', ('ineg', a), b)), b),
3232   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
3233   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
3234
3235   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
3236   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
3237   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
3238]
3239
3240# This section contains "late" optimizations that should be run after the
3241# regular optimizations have finished.  Optimizations should go here if
3242# they help code generation but do not necessarily produce code that is
3243# more easily optimizable.
3244late_optimizations = [
3245   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
3246   # results if one operand is +Inf and the other is -Inf.
3247   #
3248   # 1. Inf + -Inf = NaN
3249   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
3250   # 3. ∀x: x != NaN = true
3251   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
3252   #
3253   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
3254   #  (a+b) < 0        false            false       false    false
3255   #      a < -b       false            false       false    false
3256   # -(a+b) < 0        false            false       false    false
3257   #     -a < b        false            false       false    false
3258   #  (a+b) >= 0       false            false       false    false
3259   #      a >= -b      true             true        false    false
3260   # -(a+b) >= 0       false            false       false    false
3261   #     -a >= b       true             true        false    false
3262   #  (a+b) == 0       false            false       false    false
3263   #      a == -b      true             true        false    false
3264   #  (a+b) != 0       true             true        true     true
3265   #      a != -b      false            false       true     true
3266   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
3267   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
3268   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
3269   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
3270   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
3271   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
3272   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
3273   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
3274   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
3275   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),
3276
3277   # If either source must be finite, then the original (a+b) cannot produce
3278   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
3279   # result if b is NaN. Therefore, the replacements are exact.
3280   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
3281   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
3282   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
3283   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
3284   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
3285   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),
3286
3287   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
3288   # SpvOpLessOrGreater.
3289   *add_fabs_fneg((('iand', ('fneu', 'ma', 'mb'), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma'))), {'ma' : a, 'mb' : b}),
3290   (('iand', ('fneu', a, 0.0), ('feq', a, a)), ('!flt', 0.0, ('fabs', a))),
3291
3292   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
3293   # !SpvOpLessOrGreater.
3294   *add_fabs_fneg((('ior', ('feq', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma')))), {'ma' : a, 'mb' : b}),
3295   (('ior', ('feq', a, 0.0), ('fneu', a, a)), ('inot', ('!flt', 0.0, ('fabs', a)))),
3296
3297   *add_fabs_fneg((('ior', ('flt', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('fge', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False),
3298   *add_fabs_fneg((('ior', ('fge', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('flt', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False),
3299   *add_fabs_fneg((('ior', ('flt', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('fge', 'ma', b))), {'ma' : a}),
3300   *add_fabs_fneg((('ior', ('fge', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('flt', 'ma', b))), {'ma' : a}),
3301   *add_fabs_fneg((('ior', ('flt', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('fge', a, 'mb'))), {'mb' : b}),
3302   *add_fabs_fneg((('ior', ('fge', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('flt', a, 'mb'))), {'mb' : b}),
3303   *add_fabs_fneg((('iand', ('fneu', 'ma', 'b(is_a_number)'), ('feq', a, a)), ('fneo', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}),
3304   *add_fabs_fneg((('ior', ('feq', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('fequ', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}),
3305
3306   (('ior', ('flt', a, b), ('flt', b, a)), ('fneo', a, b), 'options->has_fneo_fcmpu'),
3307   (('flt', 0.0, ('fabs', a)), ('fneo', 0.0, a), 'options->has_fneo_fcmpu'),
3308
3309
3310   # These don't interfere with the previous optimizations which include this
3311   # in the search expression, because nir_algebraic_impl visits instructions
3312   # in reverse order.
3313   (('ior', ('fneu', 'a@16', a), ('fneu', 'b@16', b)), ('funord', a, b), 'options->has_ford_funord'),
3314   (('iand', ('feq', 'a@16', a), ('feq', 'b@16', b)), ('ford', a, b), 'options->has_ford_funord'),
3315   (('ior', ('fneu', 'a@32', a), ('fneu', 'b@32', b)), ('funord', a, b), 'options->has_ford_funord'),
3316   (('iand', ('feq', 'a@32', a), ('feq', 'b@32', b)), ('ford', a, b), 'options->has_ford_funord'),
3317   (('ior', ('fneu', 'a@64', a), ('fneu', 'b@64', b)), ('funord', a, b), 'options->has_ford_funord'),
3318   (('iand', ('feq', 'a@64', a), ('feq', 'b@64', b)), ('ford', a, b), 'options->has_ford_funord'),
3319
3320   (('inot', ('ford(is_used_once)', a, b)), ('funord', a, b)),
3321   (('inot', ('funord(is_used_once)', a, b)), ('ford', a, b)),
3322   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
3323   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
3324   (('inot', ('fequ(is_used_once)', a, b)), ('fneo', a, b)),
3325   (('inot', ('fneo(is_used_once)', a, b)), ('fequ', a, b)),
3326   (('inot', ('flt(is_used_once)', a, b)), ('fgeu', a, b), 'options->has_fneo_fcmpu'),
3327   (('inot', ('fgeu(is_used_once)', a, b)), ('flt', a, b)),
3328   (('inot', ('fge(is_used_once)', a, b)), ('fltu', a, b), 'options->has_fneo_fcmpu'),
3329   (('inot', ('fltu(is_used_once)', a, b)), ('fge', a, b)),
3330
3331   # nir_lower_to_source_mods will collapse this, but its existence during the
3332   # optimization loop can prevent other optimizations.
3333   (('fneg', ('fneg', a)), a),
3334
3335   # combine imul and iadd to imad
3336   (('iadd@32', ('imul(is_only_used_by_iadd)', a, b), c), ('imad', a, b, c), 'options->has_imad32'),
3337
3338   # Drivers do not actually implement udiv_aligned_4; it is only used to
3339   # optimize scratch lowering.
3340   (('udiv_aligned_4', a), ('ushr', a, 2)),
3341]
3342
3343# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
3344# gets combined to fma(a, b, -c).
3345for sz, mulz in itertools.product([16, 32, 64], [False, True]):
3346    # fmulz/ffmaz only for fp32
3347    if mulz and sz != 32:
3348        continue
3349
3350    # Fuse the correct fmul. Only consider fmuls where the only users are fadd
3351    # (or fneg/fabs which are assumed to be propagated away), as a heuristic to
3352    # avoid fusing in cases where it's harmful.
3353    fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)'
3354    ffma = 'ffmaz' if mulz else 'ffma'
3355
3356    fadd = '~fadd@{}'.format(sz)
3357    option = 'options->fuse_ffma{}'.format(sz)
3358
3359    late_optimizations.extend([
3360        ((fadd, (fmul, a, b), c), (ffma, a, b, c), option),
3361
3362        ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c),
3363         (ffma, ('fneg', a), b, c), option),
3364
3365        ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c),
3366         (ffma, ('fabs', a), ('fabs', b), c), option),
3367
3368        ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c),
3369         (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option),
3370    ])
3371
3372late_optimizations.extend([
3373   # Subtractions get lowered during optimization, so we need to recombine them
3374   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3375   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3376   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3377   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),
3378
3379   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
3380   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
3381   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
3382   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
3383])
3384
3385for s in [8, 16, 32, 64]:
3386   cond = 'options->has_iadd3'
3387   if s == 64:
3388      cond += ' && !(options->lower_int64_options & nir_lower_iadd3_64)'
3389
3390   iadd = "iadd@{}".format(s)
3391
3392   # On Intel GPUs, the constant field for an ADD3 instruction must be either
3393   # int16_t or uint16_t.
3394   late_optimizations.extend([
3395      ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond),
3396      ((iadd, ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond),
3397      ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'),   ('iadd3', a, b, c), cond),
3398      ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3399      ((iadd, ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3400      ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'),  ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3401
3402      ((iadd, ('ishl', a, 1), 'b(is_not_const)'), ('iadd3', a, a, b), cond),
3403      ((iadd, ('ishl', a, 1), '#b(is_16_bits)' ), ('iadd3', a, a, b), cond),
3404      ((iadd, ('ineg', ('ishl', a, 1)), 'b(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', a), b), cond),
3405      ((iadd, ('ineg', ('ishl', a, 1)), '#b(is_16_bits)' ), ('iadd3', ('ineg', a), ('ineg', a), b), cond),
3406
3407      # Use special checks to ensure (b+b) or -(b+b) fit in 16 bits.
3408      (('ishl@{}'.format(s), ('iadd', a, '#b(is_2x_16_bits)'), 1), ('iadd3', a, a, ('iadd', b, b)), cond),
3409      (('ishl@{}'.format(s), ('ineg', ('iadd', a, '#b(is_neg2x_16_bits)')), 1), ('iadd3', ('ineg', a), ('ineg', a), ('ineg', ('iadd', b, b))), cond),
3410   ])
3411
3412late_optimizations.extend([
3413    # fneg_lo / fneg_hi
3414   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
3415   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),
3416
3417   # These are duplicated from the main optimizations table.  The late
3418   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
3419   # new patterns like these.  The patterns that compare with zero are removed
3420   # because they are unlikely to be created by anything in
3421   # late_optimizations.
3422   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
3423   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
3424   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
3425   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
3426
3427   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
3428
3429   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
3430
3431   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
3432   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
3433   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
3434   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
3435   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
3436   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
3437   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
3438   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
3439   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
3440   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
3441
3442   (('ior', a, a), a),
3443   (('iand', a, a), a),
3444
3445   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
3446
3447   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
3448   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
3449   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
3450   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
3451
3452   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
3453
3454   # Approximate handling of fround_even for DX9 addressing from gallium nine on
3455   # DX9-class hardware with no proper fround support.  This is in
3456   # late_optimizations so that the is_integral() opts in the main pass get a
3457   # chance to eliminate the fround_even first.
3458   (('fround_even', a), ('bcsel',
3459                         ('feq', ('ffract', a), 0.5),
3460                         ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0),
3461                         ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),
3462
3463   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
3464   # particular operation is common for expanding values stored in a texture
3465   # from [0,1] to [-1,1].
3466   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
3467   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
3468   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
3469   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
3470   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
3471   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
3472   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
3473   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
3474
3475    # flrp(a, b, a)
3476    # a*(1-a) + b*a
3477    # a + -a*a + a*b    (1)
3478    # a + a*(b - a)
3479    # Option 1: ffma(a, (b-a), a)
3480    #
3481    # Alternately, after (1):
3482    # a*(1+b) + -a*a
3483    # a*((1+b) + -a)
3484    #
3485    # Let b=1
3486    #
3487    # Option 2: ffma(a, 2, -(a*a))
3488    # Option 3: ffma(a, 2, (-a)*a)
3489    # Option 4: ffma(a, -a, (2*a))
3490    # Option 5: a * (2 - a)
3491    #
3492    # There are a lot of other possible combinations.
3493   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
3494   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3495   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3496   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3497   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3498
3499   # we do these late so that we don't get in the way of creating ffmas
3500   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
3501   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
3502
3503   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
3504   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
3505   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
3506    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
3507
3508   # Things that look like DPH in the source shader may get expanded to
3509   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
3510   # to NIR.  After FFMA is generated, this can look like:
3511   #
3512   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
3513   #
3514   # Reassociate the last addition into the first multiplication.
3515   #
3516   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
3517   # shader stages on some outputs that are intended to be invariant.  For
3518   # various reasons, this optimization may not be fully applied in all
3519   # shaders used for different rendering passes of the same geometry.  This
3520   # can result in Z-fighting artifacts (at best).  For now, disable this
3521   # optimization in these stages.  See bugzilla #111490.  In tessellation
3522   # stages applications seem to use 'precise' when necessary, so allow the
3523   # optimization in those stages.
3524   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma(is_used_once)', c, d, ('ffma', e, 'f', ('fmul(is_used_once)', 'g(is_not_const_and_not_fsign)', 'h(is_not_const_and_not_fsign)')))), 'i(is_not_const)'),
3525    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', ('ffma', 'g', 'h', 'i')))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3526   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
3527    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3528   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
3529    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3530   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
3531    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3532
3533   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
3534    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3535   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
3536    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3537   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
3538    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3539
3540   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
3541   #
3542   #    If bits is zero, the result will be zero.
3543   #
3544   # These prevent the next two lowerings from generating incorrect results when
3545   # count is zero.
3546   (('ubfe', a, b, 0), 0),
3547   (('ibfe', a, b, 0), 0),
3548
3549   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
3550   # instructions on Intel GPUs, it cannot have immediate values as
3551   # sources.  There are also limitations on source register strides.  As a
3552   # result, it is very easy for 3-source instruction combined with either
3553   # loads of immediate values or copies from weird register strides to be
3554   # more expensive than the primitive instructions it represents.
3555   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),
3556
3557   # b is the lowest order bit to be extracted and c is the number of bits to
3558   # extract.  The inner shift removes the bits above b + c by shifting left
3559   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
3560   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
3561   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
3562   # This means that it must be shifted right by 32 - c or -c bits.
3563   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),
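   # Worked example (editor's note): ibfe(a, 8, 4) becomes
   # ishr(ishl(a, -12), -4); since the shifts only see the low 5 bits of their
   # counts, this is ishr(ishl(a, 20), 28), i.e. bits 8..11 of a, sign-extended.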
3564
3565   # Clean up no-op shifts that may result from the bfe lowerings.
3566   (('ishl', a, 0), a),
3567   (('ishl', a, -32), a),
3568   (('ishr', a, 0), a),
3569   (('ishr', a, -32), a),
3570   (('ushr', a, 0), a),
3571
3572   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
3573   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
3574   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
3575   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
3576
3577   # open coded bit test
3578   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
3579   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
3580   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
3581   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
3582   (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'),
3583   (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'),
3584   (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'),
3585   (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'),
3586   (('bitz', ('ushr', a, b), 0), ('bitz', a, b)),
3587   (('bitz', ('ishr', a, b), 0), ('bitz', a, b)),
3588   (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)),
3589   (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)),
3590   (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
3591   (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
3592   (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'),
3593   (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'),
3594   (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
3595   (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
3596   (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'),
3597   (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'),
3598   (('inot', ('bitnz', a, b)), ('bitz', a, b)),
3599   (('inot', ('bitz', a, b)), ('bitnz', a, b)),
3600   (('bitnz', ('inot', a), b), ('bitz', a, b)),
3601   (('bitz', ('inot', a), b), ('bitnz', a, b)),
3602])
3603
3604# A few more extract cases we'd rather leave late
3605for N in [16, 32]:
3606    aN = 'a@{0}'.format(N)
3607    u2uM = 'u2u{0}'.format(M)
3608    i2iM = 'i2i{0}'.format(M)
3609
3610    for x in ['u', 'i']:
3611        x2xN = '{0}2{0}{1}'.format(x, N)
3612        extract_x8 = 'extract_{0}8'.format(x)
3613        extract_x16 = 'extract_{0}16'.format(x)
3614
3615        late_optimizations.extend([
3616            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
3617            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
3618        ])
3619
3620        if N > 16:
3621            late_optimizations.extend([
3622                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
3623                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
3624            ])
3625
3626# Byte insertion
3627late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
3628late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
3629late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))
3630
3631late_optimizations += [
3632   # Word insertion
3633   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),
3634
3635   # Extract and then insert
3636   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
3637   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
3638]
3639
3640# Float sizes
3641for s in [16, 32, 64]:
3642    late_optimizations.extend([
3643       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
3644       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
3645    ])
3646
3647for op in ['fadd']:
3648    late_optimizations += [
3649        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
3650        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
3651    ]
3652
3653for op in ['ffma', 'ffmaz']:
3654    late_optimizations += [
3655        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3656        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3657
3658        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3659        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3660    ]
3661
3662# mediump: If an opcode is surrounded by conversions, remove the conversions.
3663# The rationale is that type conversions + the low precision opcode are more
3664# expensive than the same arithmetic opcode at higher precision.
3665#
3666# This must be done in late optimizations, because we need normal optimizations to
3667# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
3668#
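# For example, the unary loop below turns f2f32(ffloor(f2fmp(a))) back into a
# plain 32-bit ffloor(a).
#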
3669# Unary opcodes
3670for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
3671           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
3672    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]
3673
3674# Binary opcodes
3675for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
3676    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]
3677
3678# Ternary opcodes
3679for op in ['ffma', 'flrp']:
3680    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]
3681
3682# Comparison opcodes
3683for op in ['feq', 'fge', 'flt', 'fneu']:
3684    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]
3685
3686# Do this last, so that the f2fmp patterns above have effect.
3687late_optimizations += [
3688  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
3689  # any conversions that could have been removed will have been removed in
3690  # nir_opt_algebraic so any remaining ones are required.
3691  (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"),
3692  (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"),
3693  (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"),
3694  (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"),
3695  (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"),
3696  (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"),
3697  (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"),
3698  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
3699
3700  (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"),
3701
3702  (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
3703  (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3704  (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
3705  (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3706
3707  (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
3708  (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
3709  (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
3710  (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
3711
3712  (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
3713  (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3714  (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
3715  (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3716]
3717
3718for s in [16, 32, 64]:
3719    late_optimizations.extend([
3720        (('bcsel@{}'.format(s), ('ieq', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, b, c), "options->has_icsel_eqz{} && !options->no_integers".format(s)),
3721        (('bcsel@{}'.format(s), ('ine', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, c, b), "options->has_icsel_eqz{} && !options->no_integers".format(s)),
3722    ])
3723
3724distribute_src_mods = [
3725   # Try to remove some spurious negations rather than pushing them down.
3726   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
3727   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
3728   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
3729   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
3730   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
3731   (('fneg', ('fneg', a)), a),
3732
3733   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
3734   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
3735
3736   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
3737   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
3738   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
3739
3740   # Note that negation swaps fmin <-> fmax.  I don't think there is a way to distribute
3741   # fabs() into fmin or fmax.
3742   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
3743   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
3744
3745   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
3746   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
3747   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),
3748
3749   # fdph works mostly like fdot, but to get the correct result, the negation
3750   # must be applied to the second source.
3751   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
3752
3753   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
3754   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
3755]
3756
3757before_lower_int64_optimizations = [
3758    # The i2i64(a) implies that 'a' has at most 32 bits of data.
3759    (('ishl', ('i2i64', a), b),
3760     # Effective shift count of zero, just return 'a'.
3761     ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
3762      ('bcsel', ('ilt', ('iand', b, 63), 32),
3763       # Shifting less than 32 bits, so both 32-bit halves will have
3764       # some data. These (and the else case) shift counts are of 32-bit
3765       # values, so the shift counts are implicitly modulo 32.
3766       ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a),          ('iadd', ('ineg', b), 32) )),
3767       # Shifting 32 bits or more, so lower 32 bits must be zero.
3768       ('pack_64_2x32_split', 0                        , ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
3769     '(options->lower_int64_options & nir_lower_shift64) != 0'),
3770
3771    (('ishl', ('u2u64', a), b),
3772     ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
3773      ('bcsel', ('ilt', ('iand', b, 63), 32),
3774       ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a),          ('iadd', ('ineg', b), 32) )),
3775       ('pack_64_2x32_split', 0                        , ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
3776     '(options->lower_int64_options & nir_lower_shift64) != 0'),
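    # Worked example (editor's note): with b = 8 the unsigned variant above
    # produces pack_64_2x32_split(a << 8, a >> 24); with b = 40 the masked
    # count is >= 32, so the low word is 0 and the high word is
    # a << iabs(-40 + 32) = a << 8, giving a total shift of 40.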
3777
3778    # If ineg64 is lowered, then the negation is not free. Try to eliminate
3779    # some of the negations.
3780    (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3781    (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3782    (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3783    (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3784
3785    (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
3786    (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),
3787
3788    # If the hardware can do int64, the shift is the same cost as the add. It
3789    # should be fine to do this transformation unconditionally.
3790    (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
3791    (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
3792]
3793
3794parser = argparse.ArgumentParser()
3795parser.add_argument('--out', required=True)
3796args = parser.parse_args()
3797
3798with open(args.out, "w", encoding='utf-8') as f:
3799    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
3800    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
3801                                        before_ffma_optimizations).render())
3802    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
3803                                        before_lower_int64_optimizations).render())
3804    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
3805                                        late_optimizations).render())
3806    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
3807                                        distribute_src_mods).render())
3808