• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /**
25  * \file lower_instructions.cpp
26  *
27  * Many GPUs lack native instructions for certain expression operations, and
28  * must replace them with some other expression tree.  This pass lowers some
29  * of the most common cases, allowing the lowering code to be implemented once
30  * rather than in each driver backend.
31  */
32 
33 #include "program/prog_instruction.h" /* for swizzle */
34 #include "compiler/glsl_types.h"
35 #include "ir.h"
36 #include "ir_builder.h"
37 #include "ir_optimization.h"
38 #include "util/half_float.h"
39 
40 #include <math.h>
41 
/*
 * Flag bits accepted by lower_instructions_visitor.  lower_instructions()
 * ORs together the ones that apply and the visitor tests them individually.
 */
#define FIND_LSB_TO_FLOAT_CAST    0x00020000
#define FIND_MSB_TO_FLOAT_CAST    0x00040000
#define IMUL_HIGH_TO_MUL          0x00080000
#define SQRT_TO_ABS_SQRT          0x00200000
47 
48 using namespace ir_builder;
49 
50 namespace {
51 
52 class lower_instructions_visitor : public ir_hierarchical_visitor {
53 public:
lower_instructions_visitor(unsigned lower)54    lower_instructions_visitor(unsigned lower)
55       : progress(false), lower(lower) { }
56 
57    ir_visitor_status visit_leave(ir_expression *);
58 
59    bool progress;
60 
61 private:
62    unsigned lower; /** Bitfield of which operations to lower */
63 
64    void double_dot_to_fma(ir_expression *);
65    void double_lrp(ir_expression *);
66    void find_lsb_to_float_cast(ir_expression *ir);
67    void find_msb_to_float_cast(ir_expression *ir);
68    void imul_high_to_mul(ir_expression *ir);
69    void sqrt_to_abs_sqrt(ir_expression *ir);
70 
71    ir_expression *_carry(operand a, operand b);
72 
73    static ir_constant *_imm_fp(void *mem_ctx,
74                                const glsl_type *type,
75                                double f,
76                                unsigned vector_elements=1);
77 };
78 
79 } /* anonymous namespace */
80 
/**
 * Determine if a particular type of lowering should occur
 *
 * Expands to a non-zero value when any bit of \c x is set in the visitor's
 * \c lower bitfield.  The argument is parenthesized so that a compound
 * expression such as (A | B) is masked as a whole rather than being split
 * by operator precedence.
 */
#define lowering(x) (this->lower & (x))
85 
86 bool
lower_instructions(exec_list * instructions,bool force_abs_sqrt,bool have_gpu_shader5)87 lower_instructions(exec_list *instructions, bool force_abs_sqrt,
88                    bool have_gpu_shader5)
89 {
90    unsigned what_to_lower =
91       (force_abs_sqrt ? SQRT_TO_ABS_SQRT : 0) |
92       /* Assume that if ARB_gpu_shader5 is not supported then all of the
93        * extended integer functions need lowering.  It may be necessary to add
94        * some caps for individual instructions.
95        */
96       (!have_gpu_shader5 ? FIND_LSB_TO_FLOAT_CAST |
97                            FIND_MSB_TO_FLOAT_CAST |
98                            IMUL_HIGH_TO_MUL : 0);
99 
100    lower_instructions_visitor v(what_to_lower);
101 
102    visit_list_elements(&v, instructions);
103    return v.progress;
104 }
105 
106 void
double_dot_to_fma(ir_expression * ir)107 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
108 {
109    ir_variable *temp = new(ir) ir_variable(glsl_get_base_glsl_type(ir->operands[0]->type), "dot_res",
110 					   ir_var_temporary);
111    this->base_ir->insert_before(temp);
112 
113    int nc = glsl_get_components(ir->operands[0]->type);
114    for (int i = nc - 1; i >= 1; i--) {
115       ir_assignment *assig;
116       if (i == (nc - 1)) {
117          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
118                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
119       } else {
120          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
121                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
122                                   temp));
123       }
124       this->base_ir->insert_before(assig);
125    }
126 
127    ir->operation = ir_triop_fma;
128    ir->init_num_operands();
129    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
130    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
131    ir->operands[2] = new(ir) ir_dereference_variable(temp);
132 
133    this->progress = true;
134 
135 }
136 
137 void
double_lrp(ir_expression * ir)138 lower_instructions_visitor::double_lrp(ir_expression *ir)
139 {
140    int swizval;
141    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
142    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
143 
144    switch (op2->type->vector_elements) {
145    case 1:
146       swizval = SWIZZLE_XXXX;
147       break;
148    default:
149       assert(op0->type->vector_elements == op2->type->vector_elements);
150       swizval = SWIZZLE_XYZW;
151       break;
152    }
153 
154    ir->operation = ir_triop_fma;
155    ir->init_num_operands();
156    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
157    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
158 
159    this->progress = true;
160 }
161 
162 void
find_lsb_to_float_cast(ir_expression * ir)163 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
164 {
165    /* For more details, see:
166     *
167     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
168     */
169    const unsigned elements = ir->operands[0]->type->vector_elements;
170    ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
171    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
172    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
173    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
174    ir_variable *temp =
175       new(ir) ir_variable(glsl_ivec_type(elements), "temp", ir_var_temporary);
176    ir_variable *lsb_only =
177       new(ir) ir_variable(glsl_uvec_type(elements), "lsb_only", ir_var_temporary);
178    ir_variable *as_float =
179       new(ir) ir_variable(glsl_vec_type(elements), "as_float", ir_var_temporary);
180    ir_variable *lsb =
181       new(ir) ir_variable(glsl_ivec_type(elements), "lsb", ir_var_temporary);
182 
183    ir_instruction &i = *base_ir;
184 
185    i.insert_before(temp);
186 
187    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
188       i.insert_before(assign(temp, ir->operands[0]));
189    } else {
190       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
191       i.insert_before(assign(temp, u2i(ir->operands[0])));
192    }
193 
194    /* The int-to-float conversion is lossless because (value & -value) is
195     * either a power of two or zero.  We don't use the result in the zero
196     * case.  The uint() cast is necessary so that 0x80000000 does not
197     * generate a negative value.
198     *
199     * uint lsb_only = uint(value & -value);
200     * float as_float = float(lsb_only);
201     */
202    i.insert_before(lsb_only);
203    i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
204 
205    i.insert_before(as_float);
206    i.insert_before(assign(as_float, u2f(lsb_only)));
207 
208    /* This is basically an open-coded frexp.  Implementations that have a
209     * native frexp instruction would be better served by that.  This is
210     * optimized versus a full-featured open-coded implementation in two ways:
211     *
212     * - We don't care about a correct result from subnormal numbers (including
213     *   0.0), so the raw exponent can always be safely unbiased.
214     *
215     * - The value cannot be negative, so it does not need to be masked off to
216     *   extract the exponent.
217     *
218     * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
219     */
220    i.insert_before(lsb);
221    i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
222 
223    /* Use lsb_only in the comparison instead of temp so that the & (far above)
224     * can possibly generate the result without an explicit comparison.
225     *
226     * (lsb_only == 0) ? -1 : lsb;
227     *
228     * Since our input values are all integers, the unbiased exponent must not
229     * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
230     * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
231     * better is likely GPU dependent.  Either way, the difference should be
232     * small.
233     */
234    ir->operation = ir_triop_csel;
235    ir->init_num_operands();
236    ir->operands[0] = equal(lsb_only, c0);
237    ir->operands[1] = cminus1;
238    ir->operands[2] = new(ir) ir_dereference_variable(lsb);
239 
240    this->progress = true;
241 }
242 
243 void
find_msb_to_float_cast(ir_expression * ir)244 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
245 {
246    /* For more details, see:
247     *
248     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
249     */
250    const unsigned elements = ir->operands[0]->type->vector_elements;
251    ir_constant *c0 = new(ir) ir_constant(int(0), elements);
252    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
253    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
254    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
255    ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
256    ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
257    ir_variable *temp =
258       new(ir) ir_variable(glsl_uvec_type(elements), "temp", ir_var_temporary);
259    ir_variable *as_float =
260       new(ir) ir_variable(glsl_vec_type(elements), "as_float", ir_var_temporary);
261    ir_variable *msb =
262       new(ir) ir_variable(glsl_ivec_type(elements), "msb", ir_var_temporary);
263 
264    ir_instruction &i = *base_ir;
265 
266    i.insert_before(temp);
267 
268    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
269       i.insert_before(assign(temp, ir->operands[0]));
270    } else {
271       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
272 
273       /* findMSB(uint(abs(some_int))) almost always does the right thing.
274        * There are two problem values:
275        *
276        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
277        *   31.  However, findMSB(int(0x80000000)) == 30.
278        *
279        * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
280        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
281        *
282        *    For a value of zero or negative one, -1 will be returned.
283        *
284        * For all negative number cases, including 0x80000000 and 0xffffffff,
285        * the correct value is obtained from findMSB if instead of negating the
286        * (already negative) value the logical-not is used.  A conditonal
287        * logical-not can be achieved in two instructions.
288        */
289       ir_variable *as_int =
290          new(ir) ir_variable(glsl_ivec_type(elements), "as_int", ir_var_temporary);
291       ir_constant *c31 = new(ir) ir_constant(int(31), elements);
292 
293       i.insert_before(as_int);
294       i.insert_before(assign(as_int, ir->operands[0]));
295       i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
296                                             as_int,
297                                             rshift(as_int, c31)))));
298    }
299 
300    /* The int-to-float conversion is lossless because bits are conditionally
301     * masked off the bottom of temp to ensure the value has at most 24 bits of
302     * data or is zero.  We don't use the result in the zero case.  The uint()
303     * cast is necessary so that 0x80000000 does not generate a negative value.
304     *
305     * float as_float = float(temp > 255 ? temp & ~255 : temp);
306     */
307    i.insert_before(as_float);
308    i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
309                                              bit_and(temp, cFFFFFF00),
310                                              temp))));
311 
312    /* This is basically an open-coded frexp.  Implementations that have a
313     * native frexp instruction would be better served by that.  This is
314     * optimized versus a full-featured open-coded implementation in two ways:
315     *
316     * - We don't care about a correct result from subnormal numbers (including
317     *   0.0), so the raw exponent can always be safely unbiased.
318     *
319     * - The value cannot be negative, so it does not need to be masked off to
320     *   extract the exponent.
321     *
322     * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
323     */
324    i.insert_before(msb);
325    i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
326 
327    /* Use msb in the comparison instead of temp so that the subtract can
328     * possibly generate the result without an explicit comparison.
329     *
330     * (msb < 0) ? -1 : msb;
331     *
332     * Since our input values are all integers, the unbiased exponent must not
333     * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
334     */
335    ir->operation = ir_triop_csel;
336    ir->init_num_operands();
337    ir->operands[0] = less(msb, c0);
338    ir->operands[1] = cminus1;
339    ir->operands[2] = new(ir) ir_dereference_variable(msb);
340 
341    this->progress = true;
342 }
343 
344 ir_expression *
_carry(operand a,operand b)345 lower_instructions_visitor::_carry(operand a, operand b)
346 {
347    return i2u(b2i(less(add(a, b),
348                        a.val->clone(ralloc_parent(a.val), NULL))));
349 }
350 
351 void
imul_high_to_mul(ir_expression * ir)352 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
353 {
354    /*   ABCD
355     * * EFGH
356     * ======
357     * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
358     *
359     * In GLSL, (a * b) becomes
360     *
361     * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
362     * uint m2 = (a & 0x0000ffffu) * (b >> 16);
363     * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
364     * uint m4 = (a >> 16)         * (b >> 16);
365     *
366     * uint c1;
367     * uint c2;
368     * uint lo_result;
369     * uint hi_result;
370     *
371     * lo_result = uaddCarry(m1, m2 << 16, c1);
372     * hi_result = m4 + c1;
373     * lo_result = uaddCarry(lo_result, m3 << 16, c2);
374     * hi_result = hi_result + c2;
375     * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
376     */
377    const unsigned elements = ir->operands[0]->type->vector_elements;
378    ir_variable *src1 =
379       new(ir) ir_variable(glsl_uvec_type(elements), "src1", ir_var_temporary);
380    ir_variable *src1h =
381       new(ir) ir_variable(glsl_uvec_type(elements), "src1h", ir_var_temporary);
382    ir_variable *src1l =
383       new(ir) ir_variable(glsl_uvec_type(elements), "src1l", ir_var_temporary);
384    ir_variable *src2 =
385       new(ir) ir_variable(glsl_uvec_type(elements), "src2", ir_var_temporary);
386    ir_variable *src2h =
387       new(ir) ir_variable(glsl_uvec_type(elements), "src2h", ir_var_temporary);
388    ir_variable *src2l =
389       new(ir) ir_variable(glsl_uvec_type(elements), "src2l", ir_var_temporary);
390    ir_variable *t1 =
391       new(ir) ir_variable(glsl_uvec_type(elements), "t1", ir_var_temporary);
392    ir_variable *t2 =
393       new(ir) ir_variable(glsl_uvec_type(elements), "t2", ir_var_temporary);
394    ir_variable *lo =
395       new(ir) ir_variable(glsl_uvec_type(elements), "lo", ir_var_temporary);
396    ir_variable *hi =
397       new(ir) ir_variable(glsl_uvec_type(elements), "hi", ir_var_temporary);
398    ir_variable *different_signs = NULL;
399    ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
400    ir_constant *c16 = new(ir) ir_constant(16u, elements);
401 
402    ir_instruction &i = *base_ir;
403 
404    i.insert_before(src1);
405    i.insert_before(src2);
406    i.insert_before(src1h);
407    i.insert_before(src2h);
408    i.insert_before(src1l);
409    i.insert_before(src2l);
410 
411    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
412       i.insert_before(assign(src1, ir->operands[0]));
413       i.insert_before(assign(src2, ir->operands[1]));
414    } else {
415       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
416 
417       ir_variable *itmp1 =
418          new(ir) ir_variable(glsl_ivec_type(elements), "itmp1", ir_var_temporary);
419       ir_variable *itmp2 =
420          new(ir) ir_variable(glsl_ivec_type(elements), "itmp2", ir_var_temporary);
421       ir_constant *c0 = new(ir) ir_constant(int(0), elements);
422 
423       i.insert_before(itmp1);
424       i.insert_before(itmp2);
425       i.insert_before(assign(itmp1, ir->operands[0]));
426       i.insert_before(assign(itmp2, ir->operands[1]));
427 
428       different_signs =
429          new(ir) ir_variable(glsl_bvec_type(elements), "different_signs",
430                              ir_var_temporary);
431 
432       i.insert_before(different_signs);
433       i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
434                                                    less(itmp1, c0),
435                                                    less(itmp2, c0->clone(ir, NULL)))));
436 
437       i.insert_before(assign(src1, i2u(abs(itmp1))));
438       i.insert_before(assign(src2, i2u(abs(itmp2))));
439    }
440 
441    i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
442    i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
443    i.insert_before(assign(src1h, rshift(src1, c16)));
444    i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
445 
446    i.insert_before(lo);
447    i.insert_before(hi);
448    i.insert_before(t1);
449    i.insert_before(t2);
450 
451    i.insert_before(assign(lo, mul(src1l, src2l)));
452    i.insert_before(assign(t1, mul(src1l, src2h)));
453    i.insert_before(assign(t2, mul(src1h, src2l)));
454    i.insert_before(assign(hi, mul(src1h, src2h)));
455 
456    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
457    i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
458 
459    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
460    i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
461 
462    if (different_signs == NULL) {
463       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
464 
465       ir->operation = ir_binop_add;
466       ir->init_num_operands();
467       ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
468       ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
469    } else {
470       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
471 
472       i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
473                                      rshift(t2, c16->clone(ir, NULL)))));
474 
475       /* For channels where different_signs is set we have to perform a 64-bit
476        * negation.  This is *not* the same as just negating the high 32-bits.
477        * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
478        * -1, not -0!  Recall -x == ~x + 1.
479        */
480       ir_variable *neg_hi =
481          new(ir) ir_variable(glsl_ivec_type(elements), "neg_hi", ir_var_temporary);
482       ir_constant *c1 = new(ir) ir_constant(1u, elements);
483 
484       i.insert_before(neg_hi);
485       i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
486                                          u2i(_carry(bit_not(lo), c1)))));
487 
488       ir->operation = ir_triop_csel;
489       ir->init_num_operands();
490       ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
491       ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
492       ir->operands[2] = u2i(hi);
493    }
494 }
495 
496 void
sqrt_to_abs_sqrt(ir_expression * ir)497 lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
498 {
499    ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]);
500    this->progress = true;
501 }
502 
503 ir_visitor_status
visit_leave(ir_expression * ir)504 lower_instructions_visitor::visit_leave(ir_expression *ir)
505 {
506    switch (ir->operation) {
507    case ir_binop_dot:
508       if (glsl_type_is_double(ir->operands[0]->type))
509          double_dot_to_fma(ir);
510       break;
511    case ir_triop_lrp:
512       if (glsl_type_is_double(ir->operands[0]->type))
513          double_lrp(ir);
514       break;
515 
516    case ir_unop_find_lsb:
517       if (lowering(FIND_LSB_TO_FLOAT_CAST))
518          find_lsb_to_float_cast(ir);
519       break;
520 
521    case ir_unop_find_msb:
522       if (lowering(FIND_MSB_TO_FLOAT_CAST))
523          find_msb_to_float_cast(ir);
524       break;
525 
526    case ir_binop_imul_high:
527       if (lowering(IMUL_HIGH_TO_MUL))
528          imul_high_to_mul(ir);
529       break;
530 
531    case ir_unop_rsq:
532    case ir_unop_sqrt:
533       if (lowering(SQRT_TO_ABS_SQRT))
534          sqrt_to_abs_sqrt(ir);
535       break;
536 
537    default:
538       return visit_continue;
539    }
540 
541    return visit_continue;
542 }
543