# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from __future__ import print_function

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
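# The parameter c above is a phase offset measured in turns (note the 0.5 / pi
# scaling of the argument); the fsin/fcos lowering rules further down pass 0.5
# and 0.75 respectively so that one polynomial approximation serves both.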

def intBitsToFloat(i):
    return struct.unpack('!f', struct.pack('!I', i))[0]
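# For example, intBitsToFloat(0x3f800000) == 1.0, so a rule can spell out an
# exact IEEE 754 bit pattern where a decimal literal would be imprecise.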

optimizations = [

   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'),    ('iand', a, ('isub', b, 1))),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   (('iadd', a, 0), a),
   (('usadd_4x8', a, 0), a),
   (('usadd_4x8', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8', a, 0), 0),
   (('umul_unorm_4x8', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
]

# Float sizes
for s in [16, 32, 64]:
    optimizations.extend([
       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
       # These are the same as the previous three rules, but they depend on
       # 1-fsat(x) <=> fsat(1-x).  See below.
       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       # 1 - ((1 - a) * (1 - b))
       # 1 - (1 - a - b + a*b)
       # 1 - 1 + a + b - a*b
       # a + b - a*b
       # a + b*(1 - a)
       # b*(1 - a) + 1*a
       # flrp(b, 1, a)
       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
    ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   mask = (1 << s) - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])
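   # Worked example (for s == 32): ('ishl', ('ishl', a, 20), 20) has to become
   # 0, but a naive ('ishl', a, 40) would keep only the low five bits of the
   # shift count (40 & 31 == 8) and compute a << 8, hence the in_bounds bcsel
   # above.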

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.
for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
       (('ishl', ('ushr', a, log2), log2), ('iand', a, mask)),

       # Reassociate for improved CSE
       (('iand', ('iadd', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])
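   # Concretely, with v == 4 (log2 == 2) the first rule above rewrites
   # ('ishl', ('ushr', a, 2), 2) into ('iand', a, 0xfffffffc).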

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    lo_mask = 0xffffffff >> i
    hi_mask = (0xffffffff << i) & 0xffffffff

    optimizations.extend([
        # This pattern seems to only help in the soft-fp64 code.
        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # Comparison simplifications
   (('~inot', ('flt', a, b)), ('fge', a, b)),
   (('~inot', ('fge', a, b)), ('flt', a, b)),
   (('inot', ('feq', a, b)), ('fneu', a, b)),
   (('inot', ('fneu', a, b)), ('feq', a, b)),
   (('inot', ('ilt', a, b)), ('ige', a, b)),
   (('inot', ('ult', a, b)), ('uge', a, b)),
   (('inot', ('ige', a, b)), ('ilt', a, b)),
   (('inot', ('uge', a, b)), ('ult', a, b)),
   (('inot', ('ieq', a, b)), ('ine', a, b)),
   (('inot', ('ine', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   # flt(fsat(a), b > 0 && b < 1) is inexact if a is NaN (fsat(NaN) is 0)
   # because it returns True while flt(a, b) always returns False.
   (('~flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
   # fge(b > 0 && b < 1, fsat(a)) is inexact if a is NaN (fsat(NaN) is 0)
   # because it returns True while fge(b, a) always returns False.
   (('~fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
   (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
   (('fge', 0.0, ('fsat(is_used_once)', a)), ('fge', 0.0, a)),
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('~fneu', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0),   a),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a)   because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # (a < 0.0) || (a > 1.0)
   # !(!(a < 0.0) && !(a > 1.0))
   # !((a >= 0.0) && (a <= 1.0))
   # !(a == fsat(a))
   # a != fsat(a)
   (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fneu', a, ('fsat', a)), '!options->lower_fsat'),

   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umax', a, a), a),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
])

# Integer sizes
for s in [8, 16, 32, 64]:
    optimizations.extend([
       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

       # Simplify logic to detect sign of an integer.
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
    ])

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
   (('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
   (('~ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('~iand', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
])

# Float sizes
for s in [16, 32, 64]:
    optimizations.extend([
       # These derive from the previous patterns with the application of b < 0 <=>
       # 0 < -b.  The transformation should be applied if either comparison is
       # used once as this ensures that the number of comparisons will not
       # increase.  The sources to the ior and iand are not symmetric, so the
       # rules have to be duplicated to get this behavior.
       (('~ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('~ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('~ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('~ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
       # with the bcsel, it's basically copysign(1.0, a).  There are some
       # behavior differences between this pattern and copysign w.r.t. ±0 and
       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
       # to x, regardless of whether either or both values are NaN.
       #
       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
       #
       # For all other values of 'a', the original and replacement behave as
       # copysign.
       #
       # Marking the replacement comparisons as precise prevents any future
       # optimizations from replacing either of the comparisons with the
       # logical-not of the other.
       #
       # Note: Use b2i32 in the replacement because some platforms that
       # support fp16 don't support int16.
       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

       # The C spec says, "If the value of the integral part cannot be represented
       # by the integer type, the behavior is undefined."  "Undefined" can mean
       # "the conversion doesn't happen at all."
       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

       # Ironically, mark these as imprecise because removing the conversions may
       # preserve more precision than doing the conversions (e.g.,
       # uint(float(0x81818181u)) == 0x81818200).
       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),
    ])

    optimizations.extend([
       # float? -> float? -> floatS ==> float? -> floatS
       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

       # int? -> float? -> floatS ==> int? -> floatS
       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

       # float? -> float? -> intS ==> float? -> intS
       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
    ])

    for B in [32, 64]:
        if s < B:
            optimizations.extend([
               # S = smaller, B = bigger
               # typeS -> typeB -> typeS ==> identity
               (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
               (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
               (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),

               # bool1 -> typeB -> typeS ==> bool1 -> typeS
               (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
               (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
               (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

               # floatS -> floatB -> intB ==> floatS -> intB
               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),

               # int? -> floatB -> floatS ==> int? -> floatS
               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),

               # intS -> intB -> floatB ==> intS -> floatB
               (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
               (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
            ])

# mediump variants of the above
optimizations.extend([
    # int32 -> float32 -> float16 ==> int32 -> float16
    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),

    # float32 -> float16 -> int16 ==> float32 -> int16
    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),

    # float32 -> int32 -> int16 ==> float32 -> int16
    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),

    # int32 -> int16 -> float16 ==> int32 -> float16
    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
])

# Integer sizes
for s in [8, 16, 32, 64]:
    optimizations.extend([
       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),

       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),

       # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)
       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
    ])

optimizations.extend([
   # Common pattern like 'if (i == 0 || i == 1 || ...)'
   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),

   (('ior', a, ('ieq', a, False)), True),
   (('ior', a, ('inot', a)), -1),

   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),

   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
   # The first part of the iand comes from the !__feq64_nonnan.
   #
   # The second pattern is a reformulation of the first based on the relation
   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
   # happens to be y == 0.
   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),

   # These patterns can result when (a < b || a < c) => (a < min(b, c))
   # transformations occur before constant propagation and loop-unrolling.
   (('~flt', a, ('fmax', b, a)), ('flt', a, b)),
   (('~flt', ('fmin', a, b), a), ('flt', b, a)),
   (('~fge', a, ('fmin', b, a)), True),
   (('~fge', ('fmax', a, b), a), True),
   (('~flt', a, ('fmin', b, a)), False),
   (('~flt', ('fmax', a, b), a), False),
   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
   (('~fge', ('fmin', a, b), a), ('fge', b, a)),

   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
   (('ige', a, ('imin', b, a)), True),
   (('ige', ('imax', a, b), a), True),
   (('ult', a, ('umax', b, a)), ('ult', a, b)),
   (('ult', ('umin', a, b), a), ('ult', b, a)),
   (('uge', a, ('umin', b, a)), True),
   (('uge', ('umax', a, b), a), True),
   (('ilt', a, ('imin', b, a)), False),
   (('ilt', ('imax', a, b), a), False),
   (('ige', a, ('imax', b, a)), ('ige', a, b)),
   (('ige', ('imin', a, b), a), ('ige', b, a)),
   (('ult', a, ('umin', b, a)), False),
   (('ult', ('umax', a, b), a), False),
   (('uge', a, ('umax', b, a)), ('uge', a, b)),
   (('uge', ('umin', a, b), a), ('uge', b, a)),
   (('ult', a, ('iand', b, a)), False),
   (('ult', ('ior', a, b), a), False),
   (('uge', a, ('iand', b, a)), True),
   (('uge', ('ior', a, b), a), True),

   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),

   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
   # negative.
   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
    ('iabs', ('ishr', a, b))),
   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),

   (('fabs', ('slt', a, b)), ('slt', a, b)),
   (('fabs', ('sge', a, b)), ('sge', a, b)),
   (('fabs', ('seq', a, b)), ('seq', a, b)),
   (('fabs', ('sne', a, b)), ('sne', a, b)),
   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),

   (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
   (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

   (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
   (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

   (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
   (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

   (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
   (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
   (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
   # Emulating booleans
   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
   (('iand', 'a@bool16', 1.0), ('b2f', a)),
   (('iand', 'a@bool32', 1.0), ('b2f', a)),
   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
   # Comparison with the same args.  Note that these are not done for
   # the float versions because NaN always returns false on float
   # inequalities.
   (('ilt', a, a), False),
   (('ige', a, a), True),
   (('ieq', a, a), True),
   (('ine', a, a), False),
   (('ult', a, a), False),
   (('uge', a, a), True),
   # Logical and bit operations
   (('iand', a, a), a),
   (('iand', a, ~0), a),
   (('iand', a, 0), 0),
   (('ior', a, a), a),
   (('ior', a, 0), a),
   (('ior', a, True), True),
   (('ixor', a, a), 0),
   (('ixor', a, 0), a),
   (('inot', ('inot', a)), a),
   (('ior', ('iand', a, b), b), b),
   (('ior', ('ior', a, b), b), ('ior', a, b)),
   (('iand', ('ior', a, b), b), b),
   (('iand', ('iand', a, b), b), ('iand', a, b)),
   # DeMorgan's Laws
   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
   # Shift optimizations
   (('ishl', 0, a), 0),
   (('ishl', a, 0), a),
   (('ishr', 0, a), 0),
   (('ishr', a, 0), a),
   (('ushr', 0, a), 0),
   (('ushr', a, 0), a),
   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
   # Exponential/logarithmic identities
   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a)*b + lg2(c)*d) = a^b * c^d
   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', a, 1.0), a),
   (('~fpow', a, 2.0), ('fmul', a, a)),
   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', 2.0, a), ('fexp2', a)),
   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
   # Division and reciprocal
   (('~fdiv', 1.0, a), ('frcp', a)),
   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
   (('~frcp', ('frcp', a)), a),
   (('~frcp', ('fsqrt', a)), ('frsq', a)),
   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
   # Trig
   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
   # Boolean simplifications
   (('i2b16(is_used_by_if)', a), ('ine16', a, 0)),
   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
   (('ieq', a, True), a),
   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
   (('ine', a, False), a),
   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
   (('bcsel', a, True, False), a),
   (('bcsel', a, False, True), ('inot', a)),
   (('bcsel', True, b, c), b),
   (('bcsel', False, b, c), c),

   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),

   (('bcsel', a, b, b), b),
   (('~fcsel', a, b, b), b),

   # D3D Boolean emulation
   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('iand', a, b)))),
   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('ior', a, b)))),
   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),

   # Conversions
   (('i2b16', ('b2i', 'a@16')), a),
   (('i2b32', ('b2i', 'a@32')), a),
   (('f2i', ('ftrunc', a)), ('f2i', a)),
   (('f2u', ('ftrunc', a)), ('f2u', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('i2b', ('iabs', a)), ('i2b', a)),
   (('inot', ('f2b1', a)), ('feq', a, 0.0)),

   # Conversions from 16 bits to 32 bits and back can always be removed
   (('f2fmp', ('f2f32', 'a@16')), a),
   (('i2imp', ('i2i32', 'a@16')), a),
   (('i2imp', ('u2u32', 'a@16')), a),

   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),

   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1047
1048   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1049   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1050   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1051   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1052
1053   # Conversions to 16 bits would be lossy so they should only be removed if
1054   # the instruction was generated by the precision lowering pass.
1055   (('f2f32', ('f2fmp', 'a@32')), a),
1056   (('i2i32', ('i2imp', 'a@32')), a),
1057   (('u2u32', ('i2imp', 'a@32')), a),
1058
1059   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1060   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1061   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1062   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1063
1064   (('ffloor', 'a(is_integral)'), a),
1065   (('fceil', 'a(is_integral)'), a),
1066   (('ftrunc', 'a(is_integral)'), a),
1067   # fract(x) = x - floor(x), so fract(NaN) = NaN
1068   (('~ffract', 'a(is_integral)'), 0.0),
1069   (('fabs', 'a(is_not_negative)'), a),
1070   (('iabs', 'a(is_not_negative)'), a),
1071   (('fsat', 'a(is_not_positive)'), 0.0),
1072
1073   # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
1074   # says:
1075   #
1076   #    It is undefined to convert a negative floating-point value to an
1077   #    uint.
1078   #
1079   # Assuming that (uint)some_float behaves like (uint)(int)some_float allows
1080   # some optimizations in the i965 backend to proceed.
1081   (('ige', ('f2u', a), b), ('ige', ('f2i', a), b)),
1082   (('ige', b, ('f2u', a)), ('ige', b, ('f2i', a))),
1083   (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
1084   (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
1085
1086   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1087
1088   # The result of the multiply must be in [-1, 0], so the result of the ffma
1089   # must be in [0, 1].
1090   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1091   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1092   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1093   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1094
1095   (('fneu', 'a(is_not_zero)', 0.0), True),
1096   (('feq', 'a(is_not_zero)', 0.0), False),
1097
1098   # In this chart, + means value > 0 and - means value < 0.
1099   #
1100   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1101   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1102   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1103   #
1104   # Using grouping conceptually similar to a Karnaugh map...
1105   #
1106   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1107   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1108   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1109   #
1110   # The flt / ilt cases just invert the expected result.
1111   #
1112   # The results expecting true must be marked imprecise.  The results
1113   # expecting false are fine because NaN compared >= or < anything is false.
1114
1115   (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1116   (('fge',  'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1117   (('fge',  'a(is_lt_zero)',      'b(is_not_negative)'), False),
1118
1119   (('flt',  'a(is_not_negative)', 'b(is_not_positive)'), False),
1120   (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1121   (('~flt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1122
1123   (('ine', 'a(is_not_zero)', 0), True),
1124   (('ieq', 'a(is_not_zero)', 0), False),
1125
1126   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1127   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1128   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1129
1130   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1131   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1132   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1133
1134   (('ult', 0, 'a(is_gt_zero)'), True),
1135   (('ult', a, 0), False),
1136
1137   # Packing and then unpacking does nothing
1138   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1139   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1140   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1141   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1142   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1143                           ('unpack_64_2x32_split_y', a)), a),
1144   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1145                              ('unpack_64_2x32_split_y', a))), a),
1146   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1147
1148   # Comparing two halves of an unpack separately: while this optimization
1149   # should be correct for non-constant values, it's less obvious that it's
1150   # useful in that case.  For constant values, the pack will fold and we're
1151   # guaranteed to reduce the whole tree to one instruction.
1152   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1153             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1154    ('ieq', a, ('pack_32_2x16_split', b, c))),
1155
1156   # Byte extraction
1157   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1158   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1159   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1160   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1161   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1162   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1163   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1164
1165   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1166   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1167   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1168   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1169   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1170   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1171   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1172   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1173
1174   # Word extraction
1175   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1176   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1177   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1178   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1179   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1180
1181   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1182   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1183   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1184   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1185
1186   # Lower pack/unpack
1187   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1188   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),
1189   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1190   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1191   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),
1192   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),
1193
1194   # Useless masking before unpacking
1195   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1196   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1197   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1198   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1199   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1200   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1201
1202   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1203   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1204   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1205   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1206   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1207
1208   # Optimize half packing
1209   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1210   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1211
1212   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1213    ('pack_half_2x16', ('vec2', a, b))),
1214   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1215    ('pack_half_2x16', ('vec2', a, b))),
1216
1217   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1218   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1219   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1220
1221   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1222   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1223])
1224
1225# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
1226# patterns like those below.
1227for op in ('ushr', 'ishr'):
1228   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
1229   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
1230   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
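   # For example, with op == 'ushr' and i == 3, the 'a@32' line above generates
   # (('extract_u8', ('ushr', 'a@32', 24), 0), ('extract_u8', a, 3)).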
1231
1232optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
1233
1234# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
1235# patterns like those below.
1236for op in ('extract_u8', 'extract_i8'):
1237   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
1238   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
1239   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
1240
1241optimizations.extend([
1242   # Subtracts
1243   (('ussub_4x8', a, 0), a),
1244   (('ussub_4x8', a, ~0), 0),
1245   # Lower all Subtractions first - they can get recombined later
1246   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1247   (('isub', a, b), ('iadd', a, ('ineg', b))),
1248   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1249   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1250   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1251
1252   # Propagate negation up multiplication chains
1253   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1254   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1255
1256   # Propagate constants up multiplication chains
1257   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1258   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1259   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1260   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1261   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1262   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1263
1264   # Reassociate constants in add/mul chains so they can be folded together.
1265   # For now, we mostly only handle cases where the constants are separated by
1266   # a single non-constant.  We could do better eventually.
1267   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1268   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1269   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1270   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
1271   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
1272   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
1273   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
1274   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
1275
1276   # Drop mul-div by the same value when there's no wrapping.
1277   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
1278
1279   # By definition...
1280   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
1281   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1282   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1283
1284   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
1285   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1286   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1287
1288   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
1289
1290   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1291   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1292   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
1293
1294   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
1295   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
1296
1297   # Misc. lowering
1298   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
1299   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
1300   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
1301   (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
1302
1303   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1304    ('bcsel', ('ult', 31, 'bits'), 'insert',
1305              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
1306    'options->lower_bitfield_insert'),
1307   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1308   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1309   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1310   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1311   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1312   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1313   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1314   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1315
1316   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1317   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
1318   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
1319   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1320
1321   # int64_t sum = a + b;
1322   #
1323   # if (a < 0 && b < 0 && a < sum) {
1324   #    sum = INT64_MIN;
1325   # } else if (a >= 0 && b >= 0 && sum < a) {
1326   #    sum = INT64_MAX;
1327   # }
1328   #
1329   # A couple optimizations are applied.
1330   #
1331   # 1. a < sum => sum >= 0.  This replacement works because it is known that
1332   #    a < 0 and b < 0, so sum should also be < 0 unless there was
1333   #    underflow.
1334   #
1335   # 2. sum < a => sum < 0.  This replacement works because it is known that
1336   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
1337   #    overflow.
1338   #
1339   # 3. Invert the second if-condition and swap the order of parameters for
1340   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
1341   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
1342   #
1343   # On Intel Gen11, this saves ~11 instructions.
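   # (A plain-Python sketch of this saturating-add logic appears right after
   # the end of this optimization list.)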
1344   (('iadd_sat@64', a, b), ('bcsel',
1345                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1346                            0x8000000000000000,
1347                            ('bcsel',
1348                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1349                             ('iadd', a, b),
1350                             0x7fffffffffffffff)),
1351    '(options->lower_int64_options & nir_lower_iadd64) != 0'),
1352
1353   # int64_t sum = a - b;
1354   #
1355   # if (a < 0 && b >= 0 && a < sum) {
1356   #    sum = INT64_MIN;
1357   # } else if (a >= 0 && b < 0 && a >= sum) {
1358   #    sum = INT64_MAX;
1359   # }
1360   #
1361   # Optimizations similar to the iadd_sat case are applied here.
1362   (('isub_sat@64', a, b), ('bcsel',
1363                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1364                            0x8000000000000000,
1365                            ('bcsel',
1366                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1367                             ('isub', a, b),
1368                             0x7fffffffffffffff)),
1369    '(options->lower_int64_options & nir_lower_iadd64) != 0'),
1370
1371   # These are done here instead of in the backend because the int64 lowering
1372   # pass will make a mess of the patterns.  The first patterns are
1373   # conditioned on nir_lower_minmax64 because it was not clear that it was
1374   # always an improvement on platforms that have real int64 support.  No
1375   # shaders in shader-db hit this, so it was hard to say one way or the
1376   # other.
1377   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1378   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1379   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1380   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1381   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1382   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1383
1384   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1385   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1386   # 0u < uint(a) <=> uint(a) != 0u
1387   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1388
1389   # Alternative lowering that doesn't rely on bfi.
1390   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1391    ('bcsel', ('ult', 31, 'bits'),
1392     'insert',
1393    (('ior',
1394     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
1395     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
1396    'options->lower_bitfield_insert_to_shifts'),
1397
1398   # Alternative lowering that uses bitfield_select.
1399   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1400    ('bcsel', ('ult', 31, 'bits'), 'insert',
1401              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
1402    'options->lower_bitfield_insert_to_bitfield_select'),
1403
1404   (('ibitfield_extract', 'value', 'offset', 'bits'),
1405    ('bcsel', ('ult', 31, 'bits'), 'value',
1406              ('ibfe', 'value', 'offset', 'bits')),
1407    'options->lower_bitfield_extract'),
1408
1409   (('ubitfield_extract', 'value', 'offset', 'bits'),
1410    ('bcsel', ('ult', 31, 'bits'), 'value',
1411              ('ubfe', 'value', 'offset', 'bits')),
1412    'options->lower_bitfield_extract'),
1413
1414   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
1415   (('bitfield_select', a, b, 0), ('iand', a, b)),
1416
1417   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
1418   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
1419   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
1420   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
1421   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
1422   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
1423   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
1424
1425   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
1426   #
1427   #    If bits is zero, the result will be zero.
1428   #
1429   # These patterns prevent other patterns from generating invalid results
1430   # when count is zero.
1431   (('ubfe', a, b, 0), 0),
1432   (('ibfe', a, b, 0), 0),
1433
1434   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
1435
1436   (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),
1437   (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
1438   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1439   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1440   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1441   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1442
1443   (('ibitfield_extract', 'value', 'offset', 'bits'),
1444    ('bcsel', ('ieq', 0, 'bits'),
1445     0,
1446     ('ishr',
1447       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
1448       ('isub', 32, 'bits'))),
1449    'options->lower_bitfield_extract_to_shifts'),
1450
1451   (('ubitfield_extract', 'value', 'offset', 'bits'),
1452    ('iand',
1453     ('ushr', 'value', 'offset'),
1454     ('bcsel', ('ieq', 'bits', 32),
1455      0xffffffff,
1456      ('isub', ('ishl', 1, 'bits'), 1))),
1457    'options->lower_bitfield_extract_to_shifts'),
1458
1459   (('ifind_msb', 'value'),
1460    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
1461    'options->lower_ifind_msb'),
1462
1463   (('find_lsb', 'value'),
1464    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
1465    'options->lower_find_lsb'),
1466
1467   (('extract_i8', a, 'b@32'),
1468    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
1469    'options->lower_extract_byte'),
1470
1471   (('extract_u8', a, 'b@32'),
1472    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
1473    'options->lower_extract_byte'),
1474
1475   (('extract_i16', a, 'b@32'),
1476    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
1477    'options->lower_extract_word'),
1478
1479   (('extract_u16', a, 'b@32'),
1480    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
1481    'options->lower_extract_word'),
1482
1483    (('pack_unorm_2x16', 'v'),
1484     ('pack_uvec2_to_uint',
1485        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
1486     'options->lower_pack_unorm_2x16'),
1487
1488    (('pack_unorm_4x8', 'v'),
1489     ('pack_uvec4_to_uint',
1490        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
1491     'options->lower_pack_unorm_4x8'),
1492
1493    (('pack_snorm_2x16', 'v'),
1494     ('pack_uvec2_to_uint',
1495        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
1496     'options->lower_pack_snorm_2x16'),
1497
1498    (('pack_snorm_4x8', 'v'),
1499     ('pack_uvec4_to_uint',
1500        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
1501     'options->lower_pack_snorm_4x8'),
1502
1503    (('unpack_unorm_2x16', 'v'),
1504     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
1505                                  ('extract_u16', 'v', 1))),
1506              65535.0),
1507     'options->lower_unpack_unorm_2x16'),
1508
1509    (('unpack_unorm_4x8', 'v'),
1510     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
1511                                  ('extract_u8', 'v', 1),
1512                                  ('extract_u8', 'v', 2),
1513                                  ('extract_u8', 'v', 3))),
1514              255.0),
1515     'options->lower_unpack_unorm_4x8'),
1516
1517    (('unpack_snorm_2x16', 'v'),
1518     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
1519                                                            ('extract_i16', 'v', 1))),
1520                                           32767.0))),
1521     'options->lower_unpack_snorm_2x16'),
1522
1523    (('unpack_snorm_4x8', 'v'),
1524     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
1525                                                            ('extract_i8', 'v', 1),
1526                                                            ('extract_i8', 'v', 2),
1527                                                            ('extract_i8', 'v', 3))),
1528                                           127.0))),
1529     'options->lower_unpack_snorm_4x8'),
1530
1531   (('pack_half_2x16_split', 'a@32', 'b@32'),
1532    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
1533    'options->lower_pack_split'),
1534
1535   (('unpack_half_2x16_split_x', 'a@32'),
1536    ('f2f32', ('u2u16', a)),
1537    'options->lower_pack_split'),
1538
1539   (('unpack_half_2x16_split_y', 'a@32'),
1540    ('f2f32', ('u2u16', ('ushr', a, 16))),
1541    'options->lower_pack_split'),
1542
1543   (('pack_32_2x16_split', 'a@16', 'b@16'),
1544    ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
1545    'options->lower_pack_split'),
1546
1547   (('unpack_32_2x16_split_x', 'a@32'),
1548    ('u2u16', a),
1549    'options->lower_pack_split'),
1550
1551   (('unpack_32_2x16_split_y', 'a@32'),
1552    ('u2u16', ('ushr', 'a', 16)),
1553    'options->lower_pack_split'),
1554
1555   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
1556   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
1557   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
1558   (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
1559
1560   # Address/offset calculations:
1561   # Drivers supporting imul24 should use the nir_lower_amul() pass; this
1562   # rule converts everyone else to imul:
1563   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
1564
1565   (('umul24', a, b),
1566    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
1567    '!options->has_umul24'),
1568   (('umad24', a, b, c),
1569    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
1570    '!options->has_umad24'),
1571
1572   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
1573   (('imad24_ir3', a, 0, c), (c)),
1574   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
1575
1576   # if first two srcs are const, crack apart the imad so constant folding
1577   # can clean up the imul:
1578   # TODO ffma should probably get a similar rule:
1579   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
1580
1581   # These will turn 24b address/offset calc back into 32b shifts, but
1582   # it should be safe to get back some of the bits of precision that we
1583   # already decided were not necessary:
1584   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
1585   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
1586   (('imul24', a, 0), (0)),
1587])
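
# A minimal, illustrative plain-Python model of the saturating-add lowering
# above (referenced from the iadd_sat@64 comment).  It assumes two's-complement
# wraparound for the unsaturated iadd; it is a self-check only, not a rule.
def _iadd_sat64_reference(x, y):
   # Wrapped 64-bit two's-complement sum, i.e. what a plain 64-bit iadd gives.
   s = ((x + y) + (1 << 63)) % (1 << 64) - (1 << 63)
   if x < 0 and y < 0 and s >= 0:
      return -(1 << 63)       # both negative yet sum wrapped up -> INT64_MIN
   if x < 0 or y < 0 or s >= 0:
      return s                # no overflow detected
   return (1 << 63) - 1       # both non-negative yet sum wrapped down -> INT64_MAX

assert _iadd_sat64_reference((1 << 63) - 1, 1) == (1 << 63) - 1
assert _iadd_sat64_reference(-(1 << 63), -1) == -(1 << 63)
assert _iadd_sat64_reference(-5, 7) == 2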
1588
1589# bit_size dependent lowerings
1590for bit_size in [8, 16, 32, 64]:
1591   # convenience constants
1592   intmax = (1 << (bit_size - 1)) - 1
1593   intmin = 1 << (bit_size - 1)
1594
1595   optimizations += [
1596      (('iadd_sat@' + str(bit_size), a, b),
1597       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
1598                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'),
1599      (('isub_sat@' + str(bit_size), a, b),
1600       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
1601                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
1602   ]
1603
1604invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
1605
1606for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
1607   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
1608                         ('iand', (invert[left], a, b), (invert[right], c, d))))
1609   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
1610                         ('ior', (invert[left], a, b), (invert[right], c, d))))
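
# For example, with left == 'feq' and right == 'fneu', the first append above
# yields (('inot', ('ior(is_used_once)', ('feq', a, b), ('fneu', c, d))),
#         ('iand', ('fneu', a, b), ('feq', c, d))).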
1611
1612# Optimize x2bN(b2x(x)) -> x
1613for size in type_sizes('bool'):
1614    aN = 'a@' + str(size)
1615    f2bN = 'f2b' + str(size)
1616    i2bN = 'i2b' + str(size)
1617    optimizations.append(((f2bN, ('b2f', aN)), a))
1618    optimizations.append(((i2bN, ('b2i', aN)), a))
1619
1620# Optimize x2yN(b2x(x)) -> b2y
1621for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
1622   if x != 'f' and y != 'f' and x != y:
1623      continue
1624
1625   b2x = 'b2f' if x == 'f' else 'b2i'
1626   b2y = 'b2f' if y == 'f' else 'b2i'
1627   x2yN = '{}2{}'.format(x, y)
1628   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
1629
1630# Optimize away x2xN(a@N)
1631for t in ['int', 'uint', 'float', 'bool']:
1632   for N in type_sizes(t):
1633      x2xN = '{0}2{0}{1}'.format(t[0], N)
1634      aN = 'a@{0}'.format(N)
1635      optimizations.append(((x2xN, aN), a))
1636
1637# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
1638# In particular, we can optimize away everything except upcast of downcast and
1639# upcasts where the type differs from the other cast
1640for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
1641   if N < M:
1642      # The outer cast is a down-cast.  It doesn't matter what the size of the
1643      # argument of the inner cast is because we'll never be in the upcast
1644      # of downcast case.  Regardless of types, we'll always end up with y2yN
1645      # in the end.
1646      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
1647         x2xN = '{0}2{0}{1}'.format(x, N)
1648         y2yM = '{0}2{0}{1}'.format(y, M)
1649         y2yN = '{0}2{0}{1}'.format(y, N)
1650         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
1651   elif N > M:
1652      # If the outer cast is an up-cast, we have to be more careful about the
1653      # size of the argument of the inner cast and with types.  In this case,
1654      # the type is always the type of the up-cast, which is given by the
1655      # outer cast.
1656      for P in type_sizes('uint'):
1657         # We can't optimize away up-cast of down-cast.
1658         if M < P:
1659            continue
1660
1661         # Because we're doing an up-cast of an up-cast, sign- vs. zero-extension
1662         # matters, so the types always have to match between the two casts.
1663         for x in ['i', 'u']:
1664            x2xN = '{0}2{0}{1}'.format(x, N)
1665            x2xM = '{0}2{0}{1}'.format(x, M)
1666            aP = 'a@{0}'.format(P)
1667            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
1668   else:
1669      # The N == M case is handled by other optimizations
1670      pass
1671
1672# Downcast operations should be able to see through pack
1673for t in ['i', 'u']:
1674    for N in [8, 16, 32]:
1675        x2xN = '{0}2{0}{1}'.format(t, N)
1676        optimizations += [
1677            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
1679        ]
1680
1681# Optimize comparisons with up-casts
1682for t in ['int', 'uint', 'float']:
1683    for N, M in itertools.product(type_sizes(t), repeat=2):
1684        if N == 1 or N >= M:
1685            continue
1686
1687        cond = 'true'
1688        if N == 8:
1689            cond = 'options->support_8bit_alu'
1690        elif N == 16:
1691            cond = 'options->support_16bit_alu'
1692        x2xM = '{0}2{0}{1}'.format(t[0], M)
1693        x2xN = '{0}2{0}{1}'.format(t[0], N)
1694        aN = 'a@' + str(N)
1695        bN = 'b@' + str(N)
1696        xeq = 'feq' if t == 'float' else 'ieq'
1697        xne = 'fneu' if t == 'float' else 'ine'
1698        xge = '{0}ge'.format(t[0])
1699        xlt = '{0}lt'.format(t[0])
1700
1701        # Up-casts are lossless so for correctly signed comparisons of
1702        # up-casted values we can do the comparison at the largest of the two
1703        # original sizes and drop one or both of the casts.  (We have
1704        # optimizations to drop the no-op casts which this may generate.)
1705        for P in type_sizes(t):
1706            if P == 1 or P > N:
1707                continue
1708
1709            bP = 'b@' + str(P)
1710            optimizations += [
1711                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
1712                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
1713                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
1714                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
1715                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
1716                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
1717            ]
1718
1719        # The next bit doesn't work on floats because the range checks would
1720        # get way too complicated.
1721        if t in ['int', 'uint']:
1722            if t == 'int':
1723                xN_min = -(1 << (N - 1))
1724                xN_max = (1 << (N - 1)) - 1
1725            elif t == 'uint':
1726                xN_min = 0
1727                xN_max = (1 << N) - 1
1728            else:
1729                assert False
1730
1731            # If we're up-casting and comparing to a constant, we can unfold
1732            # the comparison into a comparison with the shrunk down constant
1733            # and a check that the constant fits in the smaller bit size.
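            # (A small brute-force check of the unsigned equality case of this
            # rule appears just after this loop.)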
1734            optimizations += [
1735                ((xeq, (x2xM, aN), '#b'),
1736                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
1737                ((xne, (x2xM, aN), '#b'),
1738                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
1739                ((xlt, (x2xM, aN), '#b'),
1740                 ('iand', (xlt, xN_min, b),
1741                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
1742                ((xlt, '#a', (x2xM, bN)),
1743                 ('iand', (xlt, a, xN_max),
1744                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
1745                ((xge, (x2xM, aN), '#b'),
1746                 ('iand', (xge, xN_max, b),
1747                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
1748                ((xge, '#a', (x2xM, bN)),
1749                 ('iand', (xge, a, xN_min),
1750                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
1751            ]
1752
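# Illustrative brute-force check of the constant-unfolding rule above for the
# unsigned 8-bit equality case (a sanity check only, not one of the rules):
# u2u32(_x) == _k holds exactly when _x == (_k & 0xff) and the constant _k
# actually fits in 8 bits, i.e. (_k & 0xff) == _k.
assert all((_x == _k) == ((_x == (_k & 0xff)) and ((_k & 0xff) == _k))
           for _x in range(256) for _k in (0, 1, 127, 128, 255, 256, 1000))
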
1753# Convert masking followed by signed downcast to just unsigned downcast
1754optimizations += [
1755    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
1756    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
1757    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
1758    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
1759    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
1760    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
1761]
1762
1763def fexp2i(exp, bits):
1764   # Generate an expression which constructs value 2.0^exp or 0.0.
1765   #
1766   # We assume that exp is already in a valid range:
1767   #
1768   #   * [-15, 15] for 16-bit float
1769   #   * [-127, 127] for 32-bit float
1770   #   * [-1023, 1023] for 64-bit float
1771   #
1772   # If exp is the lowest value in the valid range, a value of 0.0 is
1773   # constructed.  Otherwise, the value 2.0^exp is constructed.
1774   if bits == 16:
1775      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
1776   elif bits == 32:
1777      return ('ishl', ('iadd', exp, 127), 23)
1778   elif bits == 64:
1779      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
1780   else:
1781      assert False
1782
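# Quick illustrative sanity check of the 32-bit layout fexp2i() relies on
# (assumption: IEEE-754 binary32): for a normal exponent, 2.0^exp is just the
# biased exponent placed in the exponent field with a zero mantissa, i.e. the
# bit pattern (exp + 127) << 23.  This is a self-check only, not a rule.
import struct as _struct
assert all(_struct.unpack('<I', _struct.pack('<f', 2.0 ** _e))[0] == (_e + 127) << 23
           for _e in range(-126, 128))
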
1783def ldexp(f, exp, bits):
1784   # The maximum possible range for a normal exponent is [-126, 127] and,
1785   # throwing in denormals, you get a maximum range of [-149, 127].  This
1786   # means that we can potentially have a swing of +-276.  If you start with
1787   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
1788   # all the way to zero.  The GLSL spec only requires that we handle a subset
1789   # of this range.  From version 4.60 of the spec:
1790   #
1791   #    "If exp is greater than +128 (single-precision) or +1024
1792   #    (double-precision), the value returned is undefined. If exp is less
1793   #    than -126 (single-precision) or -1022 (double-precision), the value
1794   #    returned may be flushed to zero. Additionally, splitting the value
1795   #    into a significand and exponent using frexp() and then reconstructing
1796   #    a floating-point value using ldexp() should yield the original input
1797   #    for zero and all finite non-denormalized values."
1798   #
1799   # The SPIR-V spec has similar language.
1800   #
1801   # In order to handle the maximum value +128 using the fexp2i() helper
1802   # above, we have to split the exponent in half and do two multiply
1803   # operations.
1804   #
1805   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
1806   # twice the full range that is valid for the fexp2i() function above.  If
1807   # exp/2 is the bottom value of that range, the fexp2i() expression will
1808   # yield 0.0f which, when multiplied by f, will flush it to zero which is
1809   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
1810   # value is clamped from above, then it must have been above the supported
1811   # range of the GLSL built-in and therefore any return value is acceptable.
1812   if bits == 16:
1813      exp = ('imin', ('imax', exp, -30), 30)
1814   elif bits == 32:
1815      exp = ('imin', ('imax', exp, -254), 254)
1816   elif bits == 64:
1817      exp = ('imin', ('imax', exp, -2046), 2046)
1818   else:
1819      assert False
1820
1821   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
1822   # (We use ishr which isn't the same for -1, but the -1 case still works
1823   # since we use exp-exp/2 as the second exponent.)  While the spec
1824   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
1825   # work with denormals and doesn't allow for the full swing in exponents
1826   # that you can get with normalized values.  Instead, we create two powers
1827   # of two and multiply by them each in turn.  That way the effective range
1828   # of our exponent is doubled.
1829   pow2_1 = fexp2i(('ishr', exp, 1), bits)
1830   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
1831   return ('fmul', ('fmul', f, pow2_1), pow2_2)
1832
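# Illustrative check of the exponent split used above, for the 32-bit case
# (assumption: Python's >> is an arithmetic shift, matching ishr): after
# clamping to [-254, 254], both exp >> 1 and exp - (exp >> 1) land in
# [-127, 127], the range fexp2i() accepts, and they sum back to exp.
assert all(-127 <= (_e >> 1) <= 127 and
           -127 <= _e - (_e >> 1) <= 127 and
           (_e >> 1) + (_e - (_e >> 1)) == _e
           for _e in range(-254, 255))
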
1833optimizations += [
1834   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
1835   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
1836   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
1837]
1838
1839# Unreal Engine 4 demo applications open-code bitfieldReverse()
1840def bitfield_reverse(u):
1841    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
1842    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
1843    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
1844    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
1845    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
1846
1847    return step5
1848
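# A plain-Python version of the same mask-and-shift network, included only as
# an illustrative cross-check of the constants above (assumes 32-bit unsigned
# values); it is not used by the rule generation.
def _bitfield_reverse32_reference(u):
    u = ((u << 16) | (u >> 16)) & 0xffffffff
    u = (((u & 0x00ff00ff) << 8) | ((u & 0xff00ff00) >> 8)) & 0xffffffff
    u = (((u & 0x0f0f0f0f) << 4) | ((u & 0xf0f0f0f0) >> 4)) & 0xffffffff
    u = (((u & 0x33333333) << 2) | ((u & 0xcccccccc) >> 2)) & 0xffffffff
    u = (((u & 0x55555555) << 1) | ((u & 0xaaaaaaaa) >> 1)) & 0xffffffff
    return u

assert _bitfield_reverse32_reference(0x00000001) == 0x80000000
assert _bitfield_reverse32_reference(0x80000001) == 0x80000001
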
1849optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
1850
1851# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
1852# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
1853for ncomp in [2, 3, 4, 8, 16]:
1854   optimizations += [
1855      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
1856      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
1857      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
1858      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
1859   ]
1860
1861# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
1862# then the "a == a" is redundant because it's equivalent to "a is not NaN"
1863# and, if a is a NaN, the second comparison will fail anyway.
1864for op in ['flt', 'fge', 'feq']:
1865   optimizations += [
1866      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
1867      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
1868   ]
1869
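# Illustrative check with Python floats (an aside, not a rule): ordered
# comparisons against NaN are already False, which is why dropping the
# "a == a" test above cannot change the result of the iand.
_nan = float('nan')
assert not (_nan == _nan) and not (_nan < 1.0) and not (_nan >= 1.0)
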
1870# Add optimizations to handle the case where the result of a ternary is
1871# compared to a constant.  This way we can take things like
1872#
1873# (a ? 0 : 1) > 0
1874#
1875# and turn it into
1876#
1877# a ? (0 > 0) : (1 > 0)
1878#
1879# which constant folding will eat for lunch.  The resulting ternary will
1880# further get cleaned up by the boolean reductions above and we will be
1881# left with just the original variable "a".
1882for op in ['flt', 'fge', 'feq', 'fneu',
1883           'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']:
1884   optimizations += [
1885      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
1886       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
1887      ((op, '#d', ('bcsel', a, '#b', '#c')),
1888       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
1889   ]
1890
1891
1892# For example, this converts things like
1893#
1894#    1 + mix(0, a - 1, condition)
1895#
1896# into
1897#
1898#    mix(1, (a-1)+1, condition)
1899#
1900# Other optimizations will rearrange the constants.
1901for op in ['fadd', 'fmul', 'iadd', 'imul']:
1902   optimizations += [
1903      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
1904   ]
1905
1906# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
1907# states:
1908#
1909#     If neither layout qualifier is specified, derivatives in compute shaders
1910#     return zero, which is consistent with the handling of built-in texture
1911#     functions like texture() in GLSL 4.50 compute shaders.
1912for op in ['fddx', 'fddx_fine', 'fddx_coarse',
1913           'fddy', 'fddy_fine', 'fddy_coarse']:
1914   optimizations += [
1915      ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
1916]
1917
1918# Some optimizations for ir3-specific instructions.
1919optimizations += [
1920   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
1921   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
1922   # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
1923   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
1924   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
1925]
1926
1927# These kinds of sequences can occur after nir_opt_peephole_select.
1928#
1929# NOTE: fadd is not handled here because that gets in the way of ffma
1930# generation in the i965 driver.  Instead, fadd and ffma are handled in
1931# late_optimizations.
1932
1933for op in ['flrp']:
1934    optimizations += [
1935        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
1936        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
1937        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
1938        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
1939        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
1940        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
1941    ]
1942
1943for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
1944    optimizations += [
1945        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
1946        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
1947        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
1948        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
1949    ]
1950
1951for op in ['fpow']:
1952    optimizations += [
1953        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
1954        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
1955        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
1956        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
1957    ]
1958
1959for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs']:
1960    optimizations += [
1961        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
1962    ]
1963
1964for op in ['ineg', 'iabs', 'inot', 'isign']:
1965    optimizations += [
1966        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
1967    ]
1968
1969# This section contains optimizations to propagate downsizing conversions of
1970# constructed vectors into vectors of downsized components. Whether this is
1971# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
1972# this reduces the register pressure of the vector itself and often enables the
1973# conversions to be eliminated via other algebraic rules or constant folding.
1974# In the worst case on a SIMD architecture, the propagated conversions may be
1975# revectorized via nir_opt_vectorize so instruction count is minimally
1976# impacted.
1977#
1978# On a machine with SIMD-within-a-register only, this actually
1979# counterintuitively hurts instruction count. These machines are the same that
1980# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
1981# not being set.
1982#
1983# Finally for scalar architectures, there should be no difference in generated
1984# code since it all ends up scalarized at the end, but it might minimally help
1985# compile-times.
1986
1987for i in range(2, 4 + 1):
1988   for T in ('f', 'u', 'i'):
1989      vec_inst = ('vec' + str(i),)
1990
1991      indices = ['a', 'b', 'c', 'd']
1992      suffix_in = tuple((indices[j] + '@32') for j in range(i))
1993
1994      to_16 = '{}2{}16'.format(T, T)
1995      to_mp = '{}2{}mp'.format(T, T)
1996
1997      out_16 = tuple((to_16, indices[j]) for j in range(i))
1998      out_mp = tuple((to_mp, indices[j]) for j in range(i))
1999
2000      optimizations  += [
2001         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
2002      ]
2003      # u2ump doesn't exist, because it's equal to i2imp
2004      if T in ['f', 'i']:
2005          optimizations  += [
2006             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
2007          ]
2008
2009# This section contains "late" optimizations that should be run before
2010# creating ffmas and calling regular optimizations for the final time.
2011# Optimizations should go here if they help code generation and conflict
2012# with the regular optimizations.
2013before_ffma_optimizations = [
2014   # Propagate constants down multiplication chains
2015   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
2016   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
2017   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
2018   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
2019
2020   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
2021   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
2022   (('~fadd', ('fneg', a), a), 0.0),
2023   (('iadd', ('ineg', a), a), 0),
2024   (('iadd', ('ineg', a), ('iadd', a, b)), b),
2025   (('iadd', a, ('iadd', ('ineg', a), b)), b),
2026   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
2027   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
2028
   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]

# This section contains "late" optimizations that should be run after the
# regular optimizations have finished.  Optimizations should go here if
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
   # Most of these optimizations aren't quite safe when you get infinity or
   # NaN involved, but the first one should be fine.
   (('flt',          ('fadd', a, b),  0.0), ('flt',          a, ('fneg', b))),
   (('flt', ('fneg', ('fadd', a, b)), 0.0), ('flt', ('fneg', a),         b)),
   (('~fge',          ('fadd', a, b),  0.0), ('fge',          a, ('fneg', b))),
   (('~fge', ('fneg', ('fadd', a, b)), 0.0), ('fge', ('fneg', a),         b)),
   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
   (('~fneu', ('fadd', a, b), 0.0), ('fneu', a, ('fneg', b))),

   # nir_lower_to_source_mods will collapse this, but its existence during the
   # optimization loop can prevent other optimizations.
   (('fneg', ('fneg', a)), a),

   # Subtractions get lowered during optimization, so we need to recombine them
   (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'),
   (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
   (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
   (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),

   # These are duplicated from the main optimizations table.  The late
   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
   # new patterns like these.  The patterns that compare with zero are removed
   # because they are unlikely to be created by anything in
   # late_optimizations.
   (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
   (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
   # flt(fsat(a), 1.0) is inexact because it returns true if a is NaN
   # (fsat(NaN) is 0), while flt(a, 1.0) returns false for NaN.
   (('~flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),

   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   (('ior', a, a), a),
   (('iand', a, a), a),

   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),

   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
   # particular operation is common for expanding values stored in a texture
   # from [0,1] to [-1,1].
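   # e.g. ffma(a, 2.0, -1.0) == 2.0*a - 1.0
   #                         == -1.0*(1.0 - a) + 1.0*a == flrp(-1.0, 1.0, a)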
   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

    # flrp(a, b, a)
    # a*(1-a) + b*a
    # a + -a*a + a*b    (1)
    # a + a*(b - a)
    # Option 1: ffma(a, (b-a), a)
    #
    # Alternately, after (1):
    # a*(1+b) + -a*a
    # a*((1+b) + -a)
    #
    # Let b=1
    #
    # Option 2: ffma(a, 2, -(a*a))
    # Option 3: ffma(a, 2, (-a)*a)
    # Option 4: ffma(a, -a, 2*a)
    # Option 5: a * (2 - a)
    #
    # There are a lot of other possible combinations.
   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),

   # we do these late so that we don't get in the way of creating ffmas
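   # fmin(#c + a, #c + b) -> #c + fmin(a, b) drops one of the two adds; both
   # are marked is_used_once, so neither sum is needed elsewhere.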
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
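   # 0x7f7fffff is FLT_MAX.  frsq(0.0) is +inf, so for non-negative a the fmin
   # clamps that one case to FLT_MAX, which matches what the bcsel selects.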
   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

   # Things that look like DPH in the source shader may get expanded to
   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
   # to NIR.  After FFMA is generated, this can look like:
   #
   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   #
   # Reassociate the last addition into the first multiplication.
   #
   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
   # shader stages on some outputs that are intended to be invariant.  For
   # various reasons, this optimization may not be fully applied in all
   # shaders used for different rendering passes of the same geometry.  This
   # can result in Z-fighting artifacts (at best).  For now, disable this
   # optimization in these stages.  See bugzilla #111490.  In tessellation
   # stages applications seem to use 'precise' when necessary, so allow the
   # optimization in those stages.
   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
   #
   #    If bits is zero, the result will be zero.
   #
   # These prevent the next two lowerings generating incorrect results when
   # count is zero.
   (('ubfe', a, b, 0), 0),
   (('ibfe', a, b, 0), 0),

   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
   # instructions on Intel GPUs, it cannot have immediate values as sources.
   # There are also limitations on source register strides.  As a result, it
   # is very easy for a 3-source instruction combined with either
   # loads of immediate values or copies from weird register strides to be
   # more expensive than the primitive instructions it represents.
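   # In the lowering below, ushr (like ishl, described further down) only sees
   # the low 5 bits of the shift count, so 0xffffffff >> -c is
   # 0xffffffff >> (32 - c), a mask of the c low bits; ANDing it with a >> b
   # extracts bits [b, b + c).  With c == 0 the mask would wrongly be all
   # ones, which is why the count-is-zero rules above are needed.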
   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->lower_bfe_with_two_constants'),

   # b is the lowest order bit to be extracted and c is the number of bits to
   # extract.  The inner shift removes the bits above b + c by shifting left
   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, so that
   # amount can be written as -(b + c).  The outer shift moves the bit that
   # was at b to bit zero.
   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
   # This means that it must be shifted right by 32 - c or -c bits.
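   # For example, with b = 4 and c = 8: the inner ishl shifts left by
   # -(4 + 8) & 31 == 20, placing the field in bits [24, 31]; the outer ishr
   # shifts right by -8 & 31 == 24, leaving the sign-extended field in the low
   # 8 bits.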
   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->lower_bfe_with_two_constants'),

   # Clean up no-op shifts that may result from the bfe lowerings.
   (('ishl', a, 0), a),
   (('ishl', a, -32), a),
   (('ishr', a, 0), a),
   (('ishr', a, -32), a),
   (('ushr', a, 0), a),
]

# Integer sizes
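# a != 0 && b != 0 holds exactly when umin(a, b) != 0 (and a == 0 || b == 0
# exactly when umin(a, b) == 0), so two comparisons plus a logic op become a
# umin plus a single comparison.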
for s in [8, 16, 32, 64]:
    late_optimizations.extend([
        (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),
        (('ior',  ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),
    ])

# Float sizes
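# The first pattern below uses the identity 1.0 + c*(b - 1.0) ==
# (1.0 - c) + b*c, i.e. flrp(1.0, b, c) written out for targets that lower
# flrp.  The second folds bcsel(a, 0, b2f(!b)) into b2f(!(a || b)): both
# sides are 0 when a is true and b2f(!b) otherwise.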
for s in [16, 32, 64]:
    late_optimizations.extend([
       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
    ])

for op in ['fadd']:
    late_optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
    ]

for op in ['ffma']:
    late_optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
    ]

# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that the type conversions plus the low precision opcode are
# more expensive than the same arithmetic opcode at higher precision.
#
# This must be done in late optimizations, because we need normal optimizations
# to first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
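# For example, with a 32-bit value a, f2f32(fsqrt(f2fmp(a))) collapses to
# fsqrt(a) via the unary pattern below.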
#
# Unary opcodes
for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy',
           'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

# Binary opcodes
for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

# Ternary opcodes
for op in ['ffma', 'flrp']:
    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

# Comparison opcodes
for op in ['feq', 'fge', 'flt', 'fneu']:
    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
  # any conversions that could have been removed will have been removed in
  # nir_opt_algebraic so any remaining ones are required.
  (('f2fmp', a), ('f2f16', a)),
  (('f2imp', a), ('f2i16', a)),
  (('f2ump', a), ('f2u16', a)),
  (('i2imp', a), ('i2i16', a)),
  (('i2fmp', a), ('i2f16', a)),
  (('i2imp', a), ('u2u16', a)),
  (('u2fmp', a), ('u2f16', a)),
]

distribute_src_mods = [
   # Try to remove some spurious negations rather than pushing them down.
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fneg swaps fmin and fmax: -min(a, b) == max(-a, -b).  I don't
   # think there is a way to distribute fabs() into fmin or fmax.
   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),

   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),

   # fdph works mostly like fdot, but to get the correct result, the negation
   # must be applied to the second source.
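   # (fdph(a, b) is a.x*b.x + a.y*b.y + a.z*b.z + b.w, so only negating b
   # negates every term.)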
   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),

   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                  before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
                                  distribute_src_mods).render())