• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
/**
 * \file lower_instructions.cpp
 *
 * Many GPUs lack native instructions for certain expression operations, and
 * must replace them with some other expression tree.  This pass lowers some
 * of the most common cases, allowing the lowering code to be implemented once
 * rather than in each driver backend.
 *
 * Currently supported transformations:
 * - SUB_TO_ADD_NEG
 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
 * - CARRY_TO_ARITH
 * - BORROW_TO_ARITH
 * - DOPS_TO_DFRAC
 *
 * SUB_TO_ADD_NEG:
 * ---------------
 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
 *
 * This simplifies expression reassociation, and for many backends
 * there is no subtract operation separate from adding the negation.
 * For backends with native subtract operations, they will probably
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
 * LDEXP_TO_ARITH:
 * ---------------
 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
 *
 * DFREXP_DLDEXP_TO_ARITH:
 * -----------------------
 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
 * arithmetic and bit ops for double arguments.
 *
 * CARRY_TO_ARITH:
 * ---------------
 * Converts ir_binop_carry into (x + y) < x.
 *
 * BORROW_TO_ARITH:
 * ----------------
 * Converts ir_binop_borrow into (x < y).
 *
 * DOPS_TO_DFRAC:
 * --------------
 * Converts double-precision trunc, ceil, floor and round-even into
 * expressions built on fract.
 */
70 
71 #include "program/prog_instruction.h" /* for swizzle */
72 #include "compiler/glsl_types.h"
73 #include "ir.h"
74 #include "ir_builder.h"
75 #include "ir_optimization.h"
76 #include "util/half_float.h"
77 
78 #include <math.h>
79 
80 using namespace ir_builder;
81 
82 namespace {
83 
84 class lower_instructions_visitor : public ir_hierarchical_visitor {
85 public:
lower_instructions_visitor(unsigned lower)86    lower_instructions_visitor(unsigned lower)
87       : progress(false), lower(lower) { }
88 
89    ir_visitor_status visit_leave(ir_expression *);
90 
91    bool progress;
92 
93 private:
94    unsigned lower; /** Bitfield of which operations to lower */
95 
96    void sub_to_add_neg(ir_expression *);
97    void ldexp_to_arith(ir_expression *);
98    void dldexp_to_arith(ir_expression *);
99    void dfrexp_sig_to_arith(ir_expression *);
100    void dfrexp_exp_to_arith(ir_expression *);
101    void carry_to_arith(ir_expression *);
102    void borrow_to_arith(ir_expression *);
103    void double_dot_to_fma(ir_expression *);
104    void double_lrp(ir_expression *);
105    void dceil_to_dfrac(ir_expression *);
106    void dfloor_to_dfrac(ir_expression *);
107    void dround_even_to_dfrac(ir_expression *);
108    void dtrunc_to_dfrac(ir_expression *);
109    void dsign_to_csel(ir_expression *);
110    void bit_count_to_math(ir_expression *);
111    void extract_to_shifts(ir_expression *);
112    void insert_to_shifts(ir_expression *);
113    void reverse_to_shifts(ir_expression *ir);
114    void find_lsb_to_float_cast(ir_expression *ir);
115    void find_msb_to_float_cast(ir_expression *ir);
116    void imul_high_to_mul(ir_expression *ir);
117    void sqrt_to_abs_sqrt(ir_expression *ir);
118 
119    ir_expression *_carry(operand a, operand b);
120 
121    static ir_constant *_imm_fp(void *mem_ctx,
122                                const glsl_type *type,
123                                double f,
124                                unsigned vector_elements=1);
125 };
126 
127 } /* anonymous namespace */
128 
/**
 * Determine if a particular type of lowering should occur
 *
 * Must be used inside a member function of an object that has a \c lower
 * bitfield.  The argument is parenthesized so that a compound mask such as
 * lowering(A | B) tests both bits rather than parsing as
 * (this->lower & A) | B.
 */
#define lowering(x) (this->lower & (x))
133 
134 bool
lower_instructions(exec_list * instructions,unsigned what_to_lower)135 lower_instructions(exec_list *instructions, unsigned what_to_lower)
136 {
137    lower_instructions_visitor v(what_to_lower);
138 
139    visit_list_elements(&v, instructions);
140    return v.progress;
141 }
142 
143 void
sub_to_add_neg(ir_expression * ir)144 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
145 {
146    ir->operation = ir_binop_add;
147    ir->init_num_operands();
148    ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
149 					   ir->operands[1], NULL);
150    this->progress = true;
151 }
152 
153 void
ldexp_to_arith(ir_expression * ir)154 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
155 {
156    /* Translates
157     *    ir_binop_ldexp x exp
158     * into
159     *
160     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
161     *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
162     *
163     *    if (extracted_biased_exp >= 255)
164     *       return x; // +/-inf, NaN
165     *
166     *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
167     *
168     *    if (min(resulting_biased_exp, extracted_biased_exp) < 1)
169     *       resulting_biased_exp = 0;
170     *    if (resulting_biased_exp >= 255 ||
171     *        min(resulting_biased_exp, extracted_biased_exp) < 1) {
172     *       sign_mantissa &= sign_mask;
173     *    }
174     *
175     *    return bitcast_u2f(sign_mantissa |
176     *                       lshift(i2u(resulting_biased_exp), exp_shift));
177     *
178     * which we can't actually implement as such, since the GLSL IR doesn't
179     * have vectorized if-statements. We actually implement it without branches
180     * using conditional-select:
181     *
182     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
183     *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
184     *
185     *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
186     *
187     *    flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0);
188     *    resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp)
189     *    zero_mantissa = logic_or(flush_to_zero,
190     *                             gequal(resulting_biased_exp, 255));
191     *    sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa);
192     *
193     *    result = sign_mantissa |
194     *             lshift(i2u(resulting_biased_exp), exp_shift));
195     *
196     *    return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result));
197     *
198     * The definition of ldexp in the GLSL spec says:
199     *
200     *    "If this product is too large to be represented in the
201     *     floating-point type, the result is undefined."
202     *
203     * However, the definition of ldexp in the GLSL ES spec does not contain
204     * this sentence, so we do need to handle overflow correctly.
205     *
206     * There is additional language limiting the defined range of exp, but this
207     * is merely to allow implementations that store 2^exp in a temporary
208     * variable.
209     */
210 
211    const unsigned vec_elem = ir->type->vector_elements;
212 
213    /* Types */
214    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
215    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
216    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
217 
218    /* Temporary variables */
219    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
220    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
221    ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary);
222 
223    ir_variable *extracted_biased_exp =
224       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
225    ir_variable *resulting_biased_exp =
226       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
227 
228    ir_variable *sign_mantissa =
229       new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary);
230 
231    ir_variable *flush_to_zero =
232       new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary);
233    ir_variable *zero_mantissa =
234       new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary);
235 
236    ir_instruction &i = *base_ir;
237 
238    /* Copy <x> and <exp> arguments. */
239    i.insert_before(x);
240    i.insert_before(assign(x, ir->operands[0]));
241    i.insert_before(exp);
242    i.insert_before(assign(exp, ir->operands[1]));
243 
244    /* Extract the biased exponent from <x>. */
245    i.insert_before(extracted_biased_exp);
246    i.insert_before(assign(extracted_biased_exp,
247                           rshift(bitcast_f2i(abs(x)),
248                                  new(ir) ir_constant(23, vec_elem))));
249 
250    /* The definition of ldexp in the GLSL 4.60 spec says:
251     *
252     *    "If exp is greater than +128 (single-precision) or +1024
253     *     (double-precision), the value returned is undefined. If exp is less
254     *     than -126 (single-precision) or -1022 (double-precision), the value
255     *     returned may be flushed to zero."
256     *
257     * So we do not have to guard against the possibility of addition overflow,
258     * which could happen when exp is close to INT_MAX. Addition underflow
259     * cannot happen (the worst case is 0 + (-INT_MAX)).
260     */
261    i.insert_before(resulting_biased_exp);
262    i.insert_before(assign(resulting_biased_exp,
263                           min2(add(extracted_biased_exp, exp),
264                                new(ir) ir_constant(255, vec_elem))));
265 
266    i.insert_before(sign_mantissa);
267    i.insert_before(assign(sign_mantissa,
268                           bit_and(bitcast_f2u(x),
269                                   new(ir) ir_constant(0x807fffffu, vec_elem))));
270 
271    /* We flush to zero if the original or resulting biased exponent is 0,
272     * indicating a +/-0.0 or subnormal input or output.
273     *
274     * The mantissa is set to 0 if the resulting biased exponent is 255, since
275     * an overflow should produce a +/-inf result.
276     *
277     * Note that NaN inputs are handled separately.
278     */
279    i.insert_before(flush_to_zero);
280    i.insert_before(assign(flush_to_zero,
281                           lequal(min2(resulting_biased_exp,
282                                       extracted_biased_exp),
283                                  ir_constant::zero(ir, ivec))));
284    i.insert_before(assign(resulting_biased_exp,
285                           csel(flush_to_zero,
286                                ir_constant::zero(ir, ivec),
287                                resulting_biased_exp)));
288 
289    i.insert_before(zero_mantissa);
290    i.insert_before(assign(zero_mantissa,
291                           logic_or(flush_to_zero,
292                                    equal(resulting_biased_exp,
293                                          new(ir) ir_constant(255, vec_elem)))));
294    i.insert_before(assign(sign_mantissa,
295                           csel(zero_mantissa,
296                                bit_and(sign_mantissa,
297                                        new(ir) ir_constant(0x80000000u, vec_elem)),
298                                sign_mantissa)));
299 
300    /* Don't generate new IR that would need to be lowered in an additional
301     * pass.
302     */
303    i.insert_before(result);
304    if (!lowering(INSERT_TO_SHIFTS)) {
305       i.insert_before(assign(result,
306                              bitfield_insert(sign_mantissa,
307                                              i2u(resulting_biased_exp),
308                                              new(ir) ir_constant(23u, vec_elem),
309                                              new(ir) ir_constant(8u, vec_elem))));
310    } else {
311       i.insert_before(assign(result,
312                              bit_or(sign_mantissa,
313                                     lshift(i2u(resulting_biased_exp),
314                                            new(ir) ir_constant(23, vec_elem)))));
315    }
316 
317    ir->operation = ir_triop_csel;
318    ir->init_num_operands();
319    ir->operands[0] = gequal(extracted_biased_exp,
320                             new(ir) ir_constant(255, vec_elem));
321    ir->operands[1] = new(ir) ir_dereference_variable(x);
322    ir->operands[2] = bitcast_u2f(result);
323 
324    this->progress = true;
325 }
326 
327 void
dldexp_to_arith(ir_expression * ir)328 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
329 {
330    /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
331     * from the significand.
332     */
333 
334    const unsigned vec_elem = ir->type->vector_elements;
335 
336    /* Types */
337    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
338    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
339 
340    /* Constants */
341    ir_constant *zeroi = ir_constant::zero(ir, ivec);
342 
343    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
344 
345    ir_constant *exp_shift = new(ir) ir_constant(20u);
346    ir_constant *exp_width = new(ir) ir_constant(11u);
347    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
348 
349    /* Temporary variables */
350    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
351    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
352 
353    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
354                                                   ir_var_temporary);
355 
356    ir_variable *extracted_biased_exp =
357       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
358    ir_variable *resulting_biased_exp =
359       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
360 
361    ir_variable *is_not_zero_or_underflow =
362       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
363 
364    ir_instruction &i = *base_ir;
365 
366    /* Copy <x> and <exp> arguments. */
367    i.insert_before(x);
368    i.insert_before(assign(x, ir->operands[0]));
369    i.insert_before(exp);
370    i.insert_before(assign(exp, ir->operands[1]));
371 
372    ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
373    if (lowering(DFREXP_DLDEXP_TO_ARITH))
374       dfrexp_exp_to_arith(frexp_exp);
375 
376    /* Extract the biased exponent from <x>. */
377    i.insert_before(extracted_biased_exp);
378    i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
379 
380    i.insert_before(resulting_biased_exp);
381    i.insert_before(assign(resulting_biased_exp,
382                           add(extracted_biased_exp, exp)));
383 
384    /* Test if result is ±0.0, subnormal, or underflow by checking if the
385     * resulting biased exponent would be less than 0x1. If so, the result is
386     * 0.0 with the sign of x. (Actually, invert the conditions so that
387     * immediate values are the second arguments, which is better for i965)
388     * TODO: Implement in a vector fashion.
389     */
390    i.insert_before(zero_sign_x);
391    for (unsigned elem = 0; elem < vec_elem; elem++) {
392       ir_variable *unpacked =
393          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
394       i.insert_before(unpacked);
395       i.insert_before(
396             assign(unpacked,
397                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
398       i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
399                              WRITEMASK_Y));
400       i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
401       i.insert_before(assign(zero_sign_x,
402                              expr(ir_unop_pack_double_2x32, unpacked),
403                              1 << elem));
404    }
405    i.insert_before(is_not_zero_or_underflow);
406    i.insert_before(assign(is_not_zero_or_underflow,
407                           gequal(resulting_biased_exp,
408                                   new(ir) ir_constant(0x1, vec_elem))));
409    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
410                                   x, zero_sign_x)));
411    i.insert_before(assign(resulting_biased_exp,
412                           csel(is_not_zero_or_underflow,
413                                resulting_biased_exp, zeroi)));
414 
415    /* We could test for overflows by checking if the resulting biased exponent
416     * would be greater than 0xFE. Turns out we don't need to because the GLSL
417     * spec says:
418     *
419     *    "If this product is too large to be represented in the
420     *     floating-point type, the result is undefined."
421     */
422 
423    ir_rvalue *results[4] = {NULL};
424    for (unsigned elem = 0; elem < vec_elem; elem++) {
425       ir_variable *unpacked =
426          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
427       i.insert_before(unpacked);
428       i.insert_before(
429             assign(unpacked,
430                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
431 
432       ir_expression *bfi = bitfield_insert(
433             swizzle_y(unpacked),
434             i2u(swizzle(resulting_biased_exp, elem, 1)),
435             exp_shift->clone(ir, NULL),
436             exp_width->clone(ir, NULL));
437 
438       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
439 
440       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
441    }
442 
443    ir->operation = ir_quadop_vector;
444    ir->init_num_operands();
445    ir->operands[0] = results[0];
446    ir->operands[1] = results[1];
447    ir->operands[2] = results[2];
448    ir->operands[3] = results[3];
449 
450    /* Don't generate new IR that would need to be lowered in an additional
451     * pass.
452     */
453 
454    this->progress = true;
455 }
456 
457 void
dfrexp_sig_to_arith(ir_expression * ir)458 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
459 {
460    const unsigned vec_elem = ir->type->vector_elements;
461    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
462 
463    /* Double-precision floating-point values are stored as
464     *   1 sign bit;
465     *   11 exponent bits;
466     *   52 mantissa bits.
467     *
468     * We're just extracting the significand here, so we only need to modify
469     * the upper 32-bit uint. Unfortunately we must extract each double
470     * independently as there is no vector version of unpackDouble.
471     */
472 
473    ir_instruction &i = *base_ir;
474 
475    ir_variable *is_not_zero =
476       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
477    ir_rvalue *results[4] = {NULL};
478 
479    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
480    i.insert_before(is_not_zero);
481    i.insert_before(
482          assign(is_not_zero,
483                 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
484 
485    /* TODO: Remake this as more vector-friendly when int64 support is
486     * available.
487     */
488    for (unsigned elem = 0; elem < vec_elem; elem++) {
489       ir_constant *zero = new(ir) ir_constant(0u, 1);
490       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
491 
492       /* Exponent of double floating-point values in the range [0.5, 1.0). */
493       ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
494 
495       ir_variable *bits =
496          new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
497       ir_variable *unpacked =
498          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
499 
500       ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
501 
502       i.insert_before(bits);
503       i.insert_before(unpacked);
504       i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
505 
506       /* Manipulate the high uint to remove the exponent and replace it with
507        * either the default exponent or zero.
508        */
509       i.insert_before(assign(bits, swizzle_y(unpacked)));
510       i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
511       i.insert_before(assign(bits, bit_or(bits,
512                                           csel(swizzle(is_not_zero, elem, 1),
513                                                exponent_value,
514                                                zero))));
515       i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
516       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
517    }
518 
519    /* Put the dvec back together */
520    ir->operation = ir_quadop_vector;
521    ir->init_num_operands();
522    ir->operands[0] = results[0];
523    ir->operands[1] = results[1];
524    ir->operands[2] = results[2];
525    ir->operands[3] = results[3];
526 
527    this->progress = true;
528 }
529 
530 void
dfrexp_exp_to_arith(ir_expression * ir)531 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
532 {
533    const unsigned vec_elem = ir->type->vector_elements;
534    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
535    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
536 
537    /* Double-precision floating-point values are stored as
538     *   1 sign bit;
539     *   11 exponent bits;
540     *   52 mantissa bits.
541     *
542     * We're just extracting the exponent here, so we only care about the upper
543     * 32-bit uint.
544     */
545 
546    ir_instruction &i = *base_ir;
547 
548    ir_variable *is_not_zero =
549       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
550    ir_variable *high_words =
551       new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
552    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
553    ir_constant *izero = new(ir) ir_constant(0, vec_elem);
554 
555    ir_rvalue *absval = abs(ir->operands[0]);
556 
557    i.insert_before(is_not_zero);
558    i.insert_before(high_words);
559    i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
560 
561    /* Extract all of the upper uints. */
562    for (unsigned elem = 0; elem < vec_elem; elem++) {
563       ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
564 
565       i.insert_before(assign(high_words,
566                              swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
567                              1 << elem));
568 
569    }
570    ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
571    ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
572 
573    /* For non-zero inputs, shift the exponent down and apply bias. */
574    ir->operation = ir_triop_csel;
575    ir->init_num_operands();
576    ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
577    ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
578    ir->operands[2] = izero;
579 
580    this->progress = true;
581 }
582 
583 void
carry_to_arith(ir_expression * ir)584 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
585 {
586    /* Translates
587     *   ir_binop_carry x y
588     * into
589     *   sum = ir_binop_add x y
590     *   bcarry = ir_binop_less sum x
591     *   carry = ir_unop_b2i bcarry
592     */
593 
594    ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
595    ir->operation = ir_unop_i2u;
596    ir->init_num_operands();
597    ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
598    ir->operands[1] = NULL;
599 
600    this->progress = true;
601 }
602 
603 void
borrow_to_arith(ir_expression * ir)604 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
605 {
606    /* Translates
607     *   ir_binop_borrow x y
608     * into
609     *   bcarry = ir_binop_less x y
610     *   carry = ir_unop_b2i bcarry
611     */
612 
613    ir->operation = ir_unop_i2u;
614    ir->init_num_operands();
615    ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
616    ir->operands[1] = NULL;
617 
618    this->progress = true;
619 }
620 
621 void
double_dot_to_fma(ir_expression * ir)622 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
623 {
624    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
625 					   ir_var_temporary);
626    this->base_ir->insert_before(temp);
627 
628    int nc = ir->operands[0]->type->components();
629    for (int i = nc - 1; i >= 1; i--) {
630       ir_assignment *assig;
631       if (i == (nc - 1)) {
632          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
633                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
634       } else {
635          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
636                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
637                                   temp));
638       }
639       this->base_ir->insert_before(assig);
640    }
641 
642    ir->operation = ir_triop_fma;
643    ir->init_num_operands();
644    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
645    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
646    ir->operands[2] = new(ir) ir_dereference_variable(temp);
647 
648    this->progress = true;
649 
650 }
651 
652 void
double_lrp(ir_expression * ir)653 lower_instructions_visitor::double_lrp(ir_expression *ir)
654 {
655    int swizval;
656    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
657    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
658 
659    switch (op2->type->vector_elements) {
660    case 1:
661       swizval = SWIZZLE_XXXX;
662       break;
663    default:
664       assert(op0->type->vector_elements == op2->type->vector_elements);
665       swizval = SWIZZLE_XYZW;
666       break;
667    }
668 
669    ir->operation = ir_triop_fma;
670    ir->init_num_operands();
671    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
672    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
673 
674    this->progress = true;
675 }
676 
677 void
dceil_to_dfrac(ir_expression * ir)678 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
679 {
680    /*
681     * frtemp = frac(x);
682     * temp = sub(x, frtemp);
683     * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
684     */
685    ir_instruction &i = *base_ir;
686    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
687    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
688    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
689                                              ir_var_temporary);
690 
691    i.insert_before(frtemp);
692    i.insert_before(assign(frtemp, fract(ir->operands[0])));
693 
694    ir->operation = ir_binop_add;
695    ir->init_num_operands();
696    ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
697    ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
698 
699    this->progress = true;
700 }
701 
702 void
dfloor_to_dfrac(ir_expression * ir)703 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
704 {
705    /*
706     * frtemp = frac(x);
707     * result = sub(x, frtemp);
708     */
709    ir->operation = ir_binop_sub;
710    ir->init_num_operands();
711    ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
712 
713    this->progress = true;
714 }
715 void
dround_even_to_dfrac(ir_expression * ir)716 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
717 {
718    /*
719     * insane but works
720     * temp = x + 0.5;
721     * frtemp = frac(temp);
722     * t2 = sub(temp, frtemp);
723     * if (frac(x) == 0.5)
724     *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
725     *  else
726     *     result = t2;
727 
728     */
729    ir_instruction &i = *base_ir;
730    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
731                                              ir_var_temporary);
732    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
733                                            ir_var_temporary);
734    ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
735                                            ir_var_temporary);
736    ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
737    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
738    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
739 
740    i.insert_before(temp);
741    i.insert_before(assign(temp, add(ir->operands[0], p5)));
742 
743    i.insert_before(frtemp);
744    i.insert_before(assign(frtemp, fract(temp)));
745 
746    i.insert_before(t2);
747    i.insert_before(assign(t2, sub(temp, frtemp)));
748 
749    ir->operation = ir_triop_csel;
750    ir->init_num_operands();
751    ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
752                            p5->clone(ir, NULL));
753    ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
754                                 zero),
755                           t2,
756                           sub(t2, one));
757    ir->operands[2] = new(ir) ir_dereference_variable(t2);
758 
759    this->progress = true;
760 }
761 
762 void
dtrunc_to_dfrac(ir_expression * ir)763 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
764 {
765    /*
766     * frtemp = frac(x);
767     * temp = sub(x, frtemp);
768     * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
769     */
770    ir_rvalue *arg = ir->operands[0];
771    ir_instruction &i = *base_ir;
772 
773    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
774    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
775    ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
776                                              ir_var_temporary);
777    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
778                                            ir_var_temporary);
779 
780    i.insert_before(frtemp);
781    i.insert_before(assign(frtemp, fract(arg)));
782    i.insert_before(temp);
783    i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
784 
785    ir->operation = ir_triop_csel;
786    ir->init_num_operands();
787    ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
788    ir->operands[1] = new (ir) ir_dereference_variable(temp);
789    ir->operands[2] = add(temp,
790                          csel(equal(frtemp, zero->clone(ir, NULL)),
791                               zero->clone(ir, NULL),
792                               one));
793 
794    this->progress = true;
795 }
796 
797 void
dsign_to_csel(ir_expression * ir)798 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
799 {
800    /*
801     * temp = x > 0.0 ? 1.0 : 0.0;
802     * result = x < 0.0 ? -1.0 : temp;
803     */
804    ir_rvalue *arg = ir->operands[0];
805    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
806    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
807    ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
808 
809    ir->operation = ir_triop_csel;
810    ir->init_num_operands();
811    ir->operands[0] = less(arg->clone(ir, NULL),
812                           zero->clone(ir, NULL));
813    ir->operands[1] = neg_one;
814    ir->operands[2] = csel(greater(arg, zero),
815                           one,
816                           zero->clone(ir, NULL));
817 
818    this->progress = true;
819 }
820 
821 void
bit_count_to_math(ir_expression * ir)822 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
823 {
824    /* For more details, see:
825     *
826     * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
827     */
828    const unsigned elements = ir->operands[0]->type->vector_elements;
829    ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
830                                            ir_var_temporary);
831    ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
832    ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
833    ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
834    ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
835    ir_constant *c1 = new(ir) ir_constant(1u);
836    ir_constant *c2 = new(ir) ir_constant(2u);
837    ir_constant *c4 = new(ir) ir_constant(4u);
838    ir_constant *c24 = new(ir) ir_constant(24u);
839 
840    base_ir->insert_before(temp);
841 
842    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
843       base_ir->insert_before(assign(temp, ir->operands[0]));
844    } else {
845       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
846       base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
847    }
848 
849    /* temp = temp - ((temp >> 1) & 0x55555555u); */
850    base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
851                                                          c55555555))));
852 
853    /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
854    base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
855                                            bit_and(rshift(temp, c2),
856                                                    c33333333->clone(ir, NULL)))));
857 
858    /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
859    ir->operation = ir_unop_u2i;
860    ir->init_num_operands();
861    ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
862                                 c01010101),
863                             c24);
864 
865    this->progress = true;
866 }
867 
868 void
extract_to_shifts(ir_expression * ir)869 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
870 {
871    ir_variable *bits =
872       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
873 
874    base_ir->insert_before(bits);
875    base_ir->insert_before(assign(bits, ir->operands[2]));
876 
877    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
878       ir_constant *c1 =
879          new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
880       ir_constant *c32 =
881          new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
882       ir_constant *cFFFFFFFF =
883          new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
884 
885       /* At least some hardware treats (x << y) as (x << (y%32)).  This means
886        * we'd get a mask of 0 when bits is 32.  Special case it.
887        *
888        * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
889        */
890       ir_expression *mask = csel(equal(bits, c32),
891                                  cFFFFFFFF,
892                                  sub(lshift(c1, bits), c1->clone(ir, NULL)));
893 
894       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
895        *
896        *    If bits is zero, the result will be zero.
897        *
898        * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
899        * select as in the signed integer case.
900        *
901        * (value >> offset) & mask;
902        */
903       ir->operation = ir_binop_bit_and;
904       ir->init_num_operands();
905       ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
906       ir->operands[1] = mask;
907       ir->operands[2] = NULL;
908    } else {
909       ir_constant *c0 =
910          new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
911       ir_constant *c32 =
912          new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
913       ir_variable *temp =
914          new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
915 
916       /* temp = 32 - bits; */
917       base_ir->insert_before(temp);
918       base_ir->insert_before(assign(temp, sub(c32, bits)));
919 
920       /* expr = value << (temp - offset)) >> temp; */
921       ir_expression *expr =
922          rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
923 
924       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
925        *
926        *    If bits is zero, the result will be zero.
927        *
928        * Due to the (x << (y%32)) behavior mentioned before, the (value <<
929        * (32-0)) doesn't "erase" all of the data as we would like, so finish
930        * up with:
931        *
932        * (bits == 0) ? 0 : e;
933        */
934       ir->operation = ir_triop_csel;
935       ir->init_num_operands();
936       ir->operands[0] = equal(c0, bits);
937       ir->operands[1] = c0->clone(ir, NULL);
938       ir->operands[2] = expr;
939    }
940 
941    this->progress = true;
942 }
943 
944 void
insert_to_shifts(ir_expression * ir)945 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
946 {
947    ir_constant *c1;
948    ir_constant *c32;
949    ir_constant *cFFFFFFFF;
950    ir_variable *offset =
951       new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
952    ir_variable *bits =
953       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
954    ir_variable *mask =
955       new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
956 
957    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
958       c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
959       c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
960       cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
961    } else {
962       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
963 
964       c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
965       c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
966       cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
967    }
968 
969    base_ir->insert_before(offset);
970    base_ir->insert_before(assign(offset, ir->operands[2]));
971 
972    base_ir->insert_before(bits);
973    base_ir->insert_before(assign(bits, ir->operands[3]));
974 
975    /* At least some hardware treats (x << y) as (x << (y%32)).  This means
976     * we'd get a mask of 0 when bits is 32.  Special case it.
977     *
978     * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
979     *
980     * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
981     *
982     *    The result will be undefined if offset or bits is negative, or if the
983     *    sum of offset and bits is greater than the number of bits used to
984     *    store the operand.
985     *
986     * Since it's undefined, there are a couple other ways this could be
987     * implemented.  The other way that was considered was to put the csel
988     * around the whole thing:
989     *
990     *    final_result = bits == 32 ? insert : ... ;
991     */
992    base_ir->insert_before(mask);
993 
994    base_ir->insert_before(assign(mask, csel(equal(bits, c32),
995                                             cFFFFFFFF,
996                                             lshift(sub(lshift(c1, bits),
997                                                        c1->clone(ir, NULL)),
998                                                    offset))));
999 
1000    /* (base & ~mask) | ((insert << offset) & mask) */
1001    ir->operation = ir_binop_bit_or;
1002    ir->init_num_operands();
1003    ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1004    ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1005    ir->operands[2] = NULL;
1006    ir->operands[3] = NULL;
1007 
1008    this->progress = true;
1009 }
1010 
1011 void
reverse_to_shifts(ir_expression * ir)1012 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1013 {
1014    /* For more details, see:
1015     *
1016     * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1017     */
1018    ir_constant *c1 =
1019       new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1020    ir_constant *c2 =
1021       new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1022    ir_constant *c4 =
1023       new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1024    ir_constant *c8 =
1025       new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1026    ir_constant *c16 =
1027       new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1028    ir_constant *c33333333 =
1029       new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1030    ir_constant *c55555555 =
1031       new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1032    ir_constant *c0F0F0F0F =
1033       new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1034    ir_constant *c00FF00FF =
1035       new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1036    ir_variable *temp =
1037       new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1038                           "temp", ir_var_temporary);
1039    ir_instruction &i = *base_ir;
1040 
1041    i.insert_before(temp);
1042 
1043    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1044       i.insert_before(assign(temp, ir->operands[0]));
1045    } else {
1046       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1047       i.insert_before(assign(temp, i2u(ir->operands[0])));
1048    }
1049 
1050    /* Swap odd and even bits.
1051     *
1052     * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1053     */
1054    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1055                                        lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1056                                               c1->clone(ir, NULL)))));
1057    /* Swap consecutive pairs.
1058     *
1059     * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1060     */
1061    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1062                                        lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1063                                               c2->clone(ir, NULL)))));
1064 
1065    /* Swap nibbles.
1066     *
1067     * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1068     */
1069    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1070                                        lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1071                                               c4->clone(ir, NULL)))));
1072 
1073    /* The last step is, basically, bswap.  Swap the bytes, then swap the
1074     * words.  When this code is run through GCC on x86, it does generate a
1075     * bswap instruction.
1076     *
1077     * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1078     * temp = ( temp >> 16              ) | ( temp                << 16);
1079     */
1080    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1081                                        lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1082                                               c8->clone(ir, NULL)))));
1083 
1084    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1085       ir->operation = ir_binop_bit_or;
1086       ir->init_num_operands();
1087       ir->operands[0] = rshift(temp, c16);
1088       ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1089    } else {
1090       ir->operation = ir_unop_u2i;
1091       ir->init_num_operands();
1092       ir->operands[0] = bit_or(rshift(temp, c16),
1093                                lshift(temp, c16->clone(ir, NULL)));
1094    }
1095 
1096    this->progress = true;
1097 }
1098 
1099 void
find_lsb_to_float_cast(ir_expression * ir)1100 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1101 {
1102    /* For more details, see:
1103     *
1104     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1105     */
1106    const unsigned elements = ir->operands[0]->type->vector_elements;
1107    ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1108    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1109    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1110    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1111    ir_variable *temp =
1112       new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1113    ir_variable *lsb_only =
1114       new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1115    ir_variable *as_float =
1116       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1117    ir_variable *lsb =
1118       new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1119 
1120    ir_instruction &i = *base_ir;
1121 
1122    i.insert_before(temp);
1123 
1124    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1125       i.insert_before(assign(temp, ir->operands[0]));
1126    } else {
1127       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1128       i.insert_before(assign(temp, u2i(ir->operands[0])));
1129    }
1130 
1131    /* The int-to-float conversion is lossless because (value & -value) is
1132     * either a power of two or zero.  We don't use the result in the zero
1133     * case.  The uint() cast is necessary so that 0x80000000 does not
1134     * generate a negative value.
1135     *
1136     * uint lsb_only = uint(value & -value);
1137     * float as_float = float(lsb_only);
1138     */
1139    i.insert_before(lsb_only);
1140    i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1141 
1142    i.insert_before(as_float);
1143    i.insert_before(assign(as_float, u2f(lsb_only)));
1144 
1145    /* This is basically an open-coded frexp.  Implementations that have a
1146     * native frexp instruction would be better served by that.  This is
1147     * optimized versus a full-featured open-coded implementation in two ways:
1148     *
1149     * - We don't care about a correct result from subnormal numbers (including
1150     *   0.0), so the raw exponent can always be safely unbiased.
1151     *
1152     * - The value cannot be negative, so it does not need to be masked off to
1153     *   extract the exponent.
1154     *
1155     * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1156     */
1157    i.insert_before(lsb);
1158    i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1159 
1160    /* Use lsb_only in the comparison instead of temp so that the & (far above)
1161     * can possibly generate the result without an explicit comparison.
1162     *
1163     * (lsb_only == 0) ? -1 : lsb;
1164     *
1165     * Since our input values are all integers, the unbiased exponent must not
1166     * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
1167     * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
1168     * better is likely GPU dependent.  Either way, the difference should be
1169     * small.
1170     */
1171    ir->operation = ir_triop_csel;
1172    ir->init_num_operands();
1173    ir->operands[0] = equal(lsb_only, c0);
1174    ir->operands[1] = cminus1;
1175    ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1176 
1177    this->progress = true;
1178 }
1179 
1180 void
find_msb_to_float_cast(ir_expression * ir)1181 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
1182 {
1183    /* For more details, see:
1184     *
1185     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1186     */
1187    const unsigned elements = ir->operands[0]->type->vector_elements;
1188    ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1189    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1190    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1191    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1192    ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
1193    ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
1194    ir_variable *temp =
1195       new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
1196    ir_variable *as_float =
1197       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1198    ir_variable *msb =
1199       new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
1200 
1201    ir_instruction &i = *base_ir;
1202 
1203    i.insert_before(temp);
1204 
1205    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1206       i.insert_before(assign(temp, ir->operands[0]));
1207    } else {
1208       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1209 
1210       /* findMSB(uint(abs(some_int))) almost always does the right thing.
1211        * There are two problem values:
1212        *
1213        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
1214        *   31.  However, findMSB(int(0x80000000)) == 30.
1215        *
1216        * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
1217        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1218        *
1219        *    For a value of zero or negative one, -1 will be returned.
1220        *
1221        * For all negative number cases, including 0x80000000 and 0xffffffff,
1222        * the correct value is obtained from findMSB if instead of negating the
1223        * (already negative) value the logical-not is used.  A conditonal
1224        * logical-not can be achieved in two instructions.
1225        */
1226       ir_variable *as_int =
1227          new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
1228       ir_constant *c31 = new(ir) ir_constant(int(31), elements);
1229 
1230       i.insert_before(as_int);
1231       i.insert_before(assign(as_int, ir->operands[0]));
1232       i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
1233                                             as_int,
1234                                             rshift(as_int, c31)))));
1235    }
1236 
1237    /* The int-to-float conversion is lossless because bits are conditionally
1238     * masked off the bottom of temp to ensure the value has at most 24 bits of
1239     * data or is zero.  We don't use the result in the zero case.  The uint()
1240     * cast is necessary so that 0x80000000 does not generate a negative value.
1241     *
1242     * float as_float = float(temp > 255 ? temp & ~255 : temp);
1243     */
1244    i.insert_before(as_float);
1245    i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
1246                                              bit_and(temp, cFFFFFF00),
1247                                              temp))));
1248 
1249    /* This is basically an open-coded frexp.  Implementations that have a
1250     * native frexp instruction would be better served by that.  This is
1251     * optimized versus a full-featured open-coded implementation in two ways:
1252     *
1253     * - We don't care about a correct result from subnormal numbers (including
1254     *   0.0), so the raw exponent can always be safely unbiased.
1255     *
1256     * - The value cannot be negative, so it does not need to be masked off to
1257     *   extract the exponent.
1258     *
1259     * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1260     */
1261    i.insert_before(msb);
1262    i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1263 
1264    /* Use msb in the comparison instead of temp so that the subtract can
1265     * possibly generate the result without an explicit comparison.
1266     *
1267     * (msb < 0) ? -1 : msb;
1268     *
1269     * Since our input values are all integers, the unbiased exponent must not
1270     * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
1271     */
1272    ir->operation = ir_triop_csel;
1273    ir->init_num_operands();
1274    ir->operands[0] = less(msb, c0);
1275    ir->operands[1] = cminus1;
1276    ir->operands[2] = new(ir) ir_dereference_variable(msb);
1277 
1278    this->progress = true;
1279 }
1280 
1281 ir_expression *
_carry(operand a,operand b)1282 lower_instructions_visitor::_carry(operand a, operand b)
1283 {
1284    if (lowering(CARRY_TO_ARITH))
1285       return i2u(b2i(less(add(a, b),
1286                           a.val->clone(ralloc_parent(a.val), NULL))));
1287    else
1288       return carry(a, b);
1289 }
1290 
1291 void
imul_high_to_mul(ir_expression * ir)1292 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
1293 {
1294    /*   ABCD
1295     * * EFGH
1296     * ======
1297     * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
1298     *
1299     * In GLSL, (a * b) becomes
1300     *
1301     * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
1302     * uint m2 = (a & 0x0000ffffu) * (b >> 16);
1303     * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
1304     * uint m4 = (a >> 16)         * (b >> 16);
1305     *
1306     * uint c1;
1307     * uint c2;
1308     * uint lo_result;
1309     * uint hi_result;
1310     *
1311     * lo_result = uaddCarry(m1, m2 << 16, c1);
1312     * hi_result = m4 + c1;
1313     * lo_result = uaddCarry(lo_result, m3 << 16, c2);
1314     * hi_result = hi_result + c2;
1315     * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
1316     */
1317    const unsigned elements = ir->operands[0]->type->vector_elements;
1318    ir_variable *src1 =
1319       new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary);
1320    ir_variable *src1h =
1321       new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary);
1322    ir_variable *src1l =
1323       new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary);
1324    ir_variable *src2 =
1325       new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary);
1326    ir_variable *src2h =
1327       new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary);
1328    ir_variable *src2l =
1329       new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary);
1330    ir_variable *t1 =
1331       new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary);
1332    ir_variable *t2 =
1333       new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary);
1334    ir_variable *lo =
1335       new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary);
1336    ir_variable *hi =
1337       new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary);
1338    ir_variable *different_signs = NULL;
1339    ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
1340    ir_constant *c16 = new(ir) ir_constant(16u, elements);
1341 
1342    ir_instruction &i = *base_ir;
1343 
1344    i.insert_before(src1);
1345    i.insert_before(src2);
1346    i.insert_before(src1h);
1347    i.insert_before(src2h);
1348    i.insert_before(src1l);
1349    i.insert_before(src2l);
1350 
1351    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1352       i.insert_before(assign(src1, ir->operands[0]));
1353       i.insert_before(assign(src2, ir->operands[1]));
1354    } else {
1355       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1356 
1357       ir_variable *itmp1 =
1358          new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary);
1359       ir_variable *itmp2 =
1360          new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary);
1361       ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1362 
1363       i.insert_before(itmp1);
1364       i.insert_before(itmp2);
1365       i.insert_before(assign(itmp1, ir->operands[0]));
1366       i.insert_before(assign(itmp2, ir->operands[1]));
1367 
1368       different_signs =
1369          new(ir) ir_variable(glsl_type::bvec(elements), "different_signs",
1370                              ir_var_temporary);
1371 
1372       i.insert_before(different_signs);
1373       i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
1374                                                    less(itmp1, c0),
1375                                                    less(itmp2, c0->clone(ir, NULL)))));
1376 
1377       i.insert_before(assign(src1, i2u(abs(itmp1))));
1378       i.insert_before(assign(src2, i2u(abs(itmp2))));
1379    }
1380 
1381    i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
1382    i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
1383    i.insert_before(assign(src1h, rshift(src1, c16)));
1384    i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
1385 
1386    i.insert_before(lo);
1387    i.insert_before(hi);
1388    i.insert_before(t1);
1389    i.insert_before(t2);
1390 
1391    i.insert_before(assign(lo, mul(src1l, src2l)));
1392    i.insert_before(assign(t1, mul(src1l, src2h)));
1393    i.insert_before(assign(t2, mul(src1h, src2l)));
1394    i.insert_before(assign(hi, mul(src1h, src2h)));
1395 
1396    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
1397    i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
1398 
1399    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
1400    i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
1401 
1402    if (different_signs == NULL) {
1403       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1404 
1405       ir->operation = ir_binop_add;
1406       ir->init_num_operands();
1407       ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
1408       ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
1409    } else {
1410       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1411 
1412       i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
1413                                      rshift(t2, c16->clone(ir, NULL)))));
1414 
1415       /* For channels where different_signs is set we have to perform a 64-bit
1416        * negation.  This is *not* the same as just negating the high 32-bits.
1417        * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
1418        * -1, not -0!  Recall -x == ~x + 1.
1419        */
1420       ir_variable *neg_hi =
1421          new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary);
1422       ir_constant *c1 = new(ir) ir_constant(1u, elements);
1423 
1424       i.insert_before(neg_hi);
1425       i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
1426                                          u2i(_carry(bit_not(lo), c1)))));
1427 
1428       ir->operation = ir_triop_csel;
1429       ir->init_num_operands();
1430       ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
1431       ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
1432       ir->operands[2] = u2i(hi);
1433    }
1434 }
1435 
1436 void
sqrt_to_abs_sqrt(ir_expression * ir)1437 lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
1438 {
1439    ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]);
1440    this->progress = true;
1441 }
1442 
1443 ir_visitor_status
visit_leave(ir_expression * ir)1444 lower_instructions_visitor::visit_leave(ir_expression *ir)
1445 {
1446    switch (ir->operation) {
1447    case ir_binop_dot:
1448       if (ir->operands[0]->type->is_double())
1449          double_dot_to_fma(ir);
1450       break;
1451    case ir_triop_lrp:
1452       if (ir->operands[0]->type->is_double())
1453          double_lrp(ir);
1454       break;
1455    case ir_binop_sub:
1456       if (lowering(SUB_TO_ADD_NEG))
1457 	 sub_to_add_neg(ir);
1458       break;
1459 
1460    case ir_binop_ldexp:
1461       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1462          ldexp_to_arith(ir);
1463       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1464          dldexp_to_arith(ir);
1465       break;
1466 
1467    case ir_unop_frexp_exp:
1468       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1469          dfrexp_exp_to_arith(ir);
1470       break;
1471 
1472    case ir_unop_frexp_sig:
1473       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1474          dfrexp_sig_to_arith(ir);
1475       break;
1476 
1477    case ir_binop_carry:
1478       if (lowering(CARRY_TO_ARITH))
1479          carry_to_arith(ir);
1480       break;
1481 
1482    case ir_binop_borrow:
1483       if (lowering(BORROW_TO_ARITH))
1484          borrow_to_arith(ir);
1485       break;
1486 
1487    case ir_unop_trunc:
1488       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1489          dtrunc_to_dfrac(ir);
1490       break;
1491 
1492    case ir_unop_ceil:
1493       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1494          dceil_to_dfrac(ir);
1495       break;
1496 
1497    case ir_unop_floor:
1498       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1499          dfloor_to_dfrac(ir);
1500       break;
1501 
1502    case ir_unop_round_even:
1503       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1504          dround_even_to_dfrac(ir);
1505       break;
1506 
1507    case ir_unop_sign:
1508       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1509          dsign_to_csel(ir);
1510       break;
1511 
1512    case ir_unop_bit_count:
1513       if (lowering(BIT_COUNT_TO_MATH))
1514          bit_count_to_math(ir);
1515       break;
1516 
1517    case ir_triop_bitfield_extract:
1518       if (lowering(EXTRACT_TO_SHIFTS))
1519          extract_to_shifts(ir);
1520       break;
1521 
1522    case ir_quadop_bitfield_insert:
1523       if (lowering(INSERT_TO_SHIFTS))
1524          insert_to_shifts(ir);
1525       break;
1526 
1527    case ir_unop_bitfield_reverse:
1528       if (lowering(REVERSE_TO_SHIFTS))
1529          reverse_to_shifts(ir);
1530       break;
1531 
1532    case ir_unop_find_lsb:
1533       if (lowering(FIND_LSB_TO_FLOAT_CAST))
1534          find_lsb_to_float_cast(ir);
1535       break;
1536 
1537    case ir_unop_find_msb:
1538       if (lowering(FIND_MSB_TO_FLOAT_CAST))
1539          find_msb_to_float_cast(ir);
1540       break;
1541 
1542    case ir_binop_imul_high:
1543       if (lowering(IMUL_HIGH_TO_MUL))
1544          imul_high_to_mul(ir);
1545       break;
1546 
1547    case ir_unop_rsq:
1548    case ir_unop_sqrt:
1549       if (lowering(SQRT_TO_ABS_SQRT))
1550          sqrt_to_abs_sqrt(ir);
1551       break;
1552 
1553    default:
1554       return visit_continue;
1555    }
1556 
1557    return visit_continue;
1558 }
1559