/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

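/* an instruction can use the scalar ALU slot only if it has a scalar opcode
 * and reads a single source component
 */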
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

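/* check whether instruction "b" can be co-issued in the scalar slot
 * alongside the vector instruction "a" (a == NULL means no vector
 * instruction has been picked yet)
 */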
static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where a scalar MOV can be inserted alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* go to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

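      /* each reg_state word packs a 4-bit component mask for 8 registers;
       * mr is the set of components of reg_idx not marked in this slot
       */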
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it is not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* the result must not be used again, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

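   /* build the scalar mov: a single-source MAXs that writes src1 into the
    * free component "comp" of src0's register, so both scalar sources end
    * up in the same register
    */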
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

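   /* collect every instruction that can legally go in the current slot:
    * same block, same export buffer, dependencies already emitted
    */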
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and much the same idea applies to MEM exports, which might not be
       * alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa write before read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

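   /* pick the best vector candidate first, then look for a scalar candidate
    * that can be co-issued alongside it
    */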
   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try the more complex insertion of a vector instruction as a scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

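   /* fill one ir2_sched_instr slot per iteration until sched_next() has
    * nothing left to emit
    */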
   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
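   /* the loop claimed one extra slot before sched_next() returned -1, so
    * drop it from the count
    */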
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
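   /* a vertex shader compiled without a paired fragment shader is the
    * binning pass variant
    */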
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill unneeded instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* decide instruction order.. and do vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}