/*
 * Copyright © 2015 Red Hat
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "nir.h"
#include "nir_builder.h"

/* Has two paths:
 * One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on
 * NV50LegalizeSSA::handleDIV()
 *
 * Note that this path probably does not have enough precision for
 * compute shaders. Perhaps we want a second, higher-precision (looping)
 * version of this? Or perhaps we assume that if you can do compute shaders
 * you can also branch out to a pre-optimized shader library routine.
 *
 * The other path (nir_lower_idiv_precise) is based on code used by LLVM's
 * AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
 */
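
/* Usage sketch (illustrative, not taken from this file): a driver whose
 * hardware lacks native integer division might invoke this pass from its
 * lowering loop, e.g.
 *
 *    NIR_PASS(progress, shader, nir_lower_idiv, nir_lower_idiv_precise);
 *
 * picking nir_lower_idiv_fast instead when instruction count matters more
 * than exact 32-bit results.
 */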

static bool
convert_instr(nir_builder *bld, nir_alu_instr *alu)
{
   nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r, *rt;
   nir_op op = alu->op;
   bool is_signed;

   if ((op != nir_op_idiv) &&
       (op != nir_op_udiv) &&
       (op != nir_op_imod) &&
       (op != nir_op_umod) &&
       (op != nir_op_irem))
      return false;

   is_signed = (op == nir_op_idiv ||
                op == nir_op_imod ||
                op == nir_op_irem);

   bld->cursor = nir_before_instr(&alu->instr);

   numer = nir_ssa_for_alu_src(bld, alu, 0);
   denom = nir_ssa_for_alu_src(bld, alu, 1);

   if (is_signed) {
      af = nir_i2f32(bld, numer);
      bf = nir_i2f32(bld, denom);
      af = nir_fabs(bld, af);
      bf = nir_fabs(bld, bf);
      a  = nir_iabs(bld, numer);
      b  = nir_iabs(bld, denom);
   } else {
      af = nir_u2f32(bld, numer);
      bf = nir_u2f32(bld, denom);
      a  = numer;
      b  = denom;
   }

   /* get first result: */
   bf = nir_frcp(bld, bf);
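   /* Treat the f32 reciprocal's bits as an integer and subtract 2, i.e. drop
    * the estimate by two ULPs; presumably this keeps the initial quotient
    * from overshooting so the correction below only ever has to add.
    */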
   bf = nir_isub(bld, bf, nir_imm_int(bld, 2));  /* yes, really */
   q  = nir_fmul(bld, af, bf);

   if (is_signed) {
      q = nir_f2i32(bld, q);
   } else {
      q = nir_f2u32(bld, q);
   }

   /* get error of first result: */
   r = nir_imul(bld, q, b);
   r = nir_isub(bld, a, r);
   r = nir_u2f32(bld, r);
   r = nir_fmul(bld, r, bf);
   r = nir_f2u32(bld, r);

   /* add quotients: */
   q = nir_iadd(bld, q, r);

   /* correction: if modulus >= divisor, add 1 */
   r = nir_imul(bld, q, b);
   r = nir_isub(bld, a, r);
   rt = nir_uge(bld, r, b);

   if (op == nir_op_umod) {
      q = nir_bcsel(bld, rt, nir_isub(bld, r, b), r);
   } else {
      r = nir_b2i32(bld, rt);

      q = nir_iadd(bld, q, r);
      if (is_signed)  {
         /* fix the sign: */
         r = nir_ixor(bld, numer, denom);
         r = nir_ilt(bld, r, nir_imm_int(bld, 0));
         b = nir_ineg(bld, q);
         q = nir_bcsel(bld, r, b, q);

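         /* For irem/imod, rebuild the remainder from the sign-corrected
          * quotient (numer - q * denom); imod additionally shifts a nonzero
          * remainder toward the sign of the divisor when the operand signs
          * differ.
          */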
         if (op == nir_op_imod || op == nir_op_irem) {
            q = nir_imul(bld, q, denom);
            q = nir_isub(bld, numer, q);
            if (op == nir_op_imod) {
               q = nir_bcsel(bld, nir_ieq_imm(bld, q, 0),
                             nir_imm_int(bld, 0),
                             nir_bcsel(bld, r, nir_iadd(bld, q, denom), q));
            }
         }
      }
   }

   assert(alu->dest.dest.is_ssa);
   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(q));

   return true;
}

/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */
static nir_ssa_def *
emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
{
   nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom));
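   /* Scale the f32 reciprocal into 32-bit fixed point; the constant is
    * 2^32 - 512 (0x4f7ffffe as a float), presumably chosen to sit just
    * below 2^32 so the estimate errs low rather than overflowing.
    */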
   rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294966784.0));

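   /* One Newton-Raphson-style fixed-point refinement of the reciprocal:
    * rcp += umul_high(rcp, -rcp * denom).
    */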
   nir_ssa_def *neg_rcp_times_denom =
      nir_imul(bld, rcp, nir_ineg(bld, denom));
   rcp = nir_iadd(bld, rcp, nir_umul_high(bld, rcp, neg_rcp_times_denom));

   /* Get an initial estimate for the quotient/remainder, then refine the
    * estimate in the two steps below. */
   nir_ssa_def *quotient = nir_umul_high(bld, numer, rcp);
   nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom);
   nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder);

   /* First refinement step */
   nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom);
   if (!modulo) {
      quotient = nir_bcsel(bld, remainder_ge_den,
                           nir_iadd_imm(bld, quotient, 1), quotient);
   }
   remainder = nir_bcsel(bld, remainder_ge_den,
                         nir_isub(bld, remainder, denom), remainder);

   /* Second refinement step */
   remainder_ge_den = nir_uge(bld, remainder, denom);
   if (modulo) {
      return nir_bcsel(bld, remainder_ge_den, nir_isub(bld, remainder, denom),
                       remainder);
   } else {
      return nir_bcsel(bld, remainder_ge_den, nir_iadd_imm(bld, quotient, 1),
                       quotient);
   }
}

/* ported from LLVM's AMDGPUTargetLowering::LowerSDIVREM */
static nir_ssa_def *
emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op)
{
   nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0));
   nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0));
   lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
   rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));

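   /* With sign = 0 or -1, (x + sign) ^ sign gives |x| in two's complement;
    * the inverse, (x ^ sign) - sign, restores the sign afterwards.
    */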
   nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign);
   nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign);
   lhs = nir_ixor(bld, lhs, lh_sign);
   rhs = nir_ixor(bld, rhs, rh_sign);

   if (op == nir_op_idiv) {
      nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign);
      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false);
      res = nir_ixor(bld, res, d_sign);
      return nir_isub(bld, res, d_sign);
   } else {
      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true);
      res = nir_ixor(bld, res, lh_sign);
      res = nir_isub(bld, res, lh_sign);
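      /* imod: if the remainder is nonzero and the operands' signs differ,
       * add denom so the result takes the sign of the divisor.
       */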
      if (op == nir_op_imod) {
         nir_ssa_def *cond = nir_ieq_imm(bld, res, 0);
         cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond);
         res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom));
      }
      return res;
   }
}

static bool
convert_instr_precise(nir_builder *bld, nir_alu_instr *alu)
{
   nir_op op = alu->op;

   if ((op != nir_op_idiv) &&
       (op != nir_op_imod) &&
       (op != nir_op_irem) &&
       (op != nir_op_udiv) &&
       (op != nir_op_umod))
      return false;

   if (alu->dest.dest.ssa.bit_size != 32)
      return false;

   bld->cursor = nir_before_instr(&alu->instr);

   nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *denom = nir_ssa_for_alu_src(bld, alu, 1);

   nir_ssa_def *res = NULL;

   if (op == nir_op_udiv || op == nir_op_umod)
      res = emit_udiv(bld, numer, denom, op == nir_op_umod);
   else
      res = emit_idiv(bld, numer, denom, op);

   assert(alu->dest.dest.is_ssa);
   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res));

   return true;
}

static bool
convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_alu && path == nir_lower_idiv_precise)
            progress |= convert_instr_precise(&b, nir_instr_as_alu(instr));
         else if (instr->type == nir_instr_type_alu)
            progress |= convert_instr(&b, nir_instr_as_alu(instr));
      }
   }

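   /* This pass only inserts ALU instructions; no control flow is added or
    * removed, so block indices and dominance information stay valid.
    */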
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   return progress;
}

bool
nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= convert_impl(function->impl, path);
   }

   return progress;
}