/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later (see scalarize_case1) */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* go to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

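   /* reserve the chosen component in every slot from the insertion point
    * to the end, so later insertions will not reuse it
    */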
   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO: insert() needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* check that the result is not used anywhere else, since we will
    * overwrite it
    */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

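   /* the mov is emitted as a scalar MAXs writing src1 into the free
    * component of src0's register, so both scalar sources end up in the
    * same register
    */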
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with the next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and it's somewhat the same idea with MEM exports, which might not
       * be alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa writes before reads */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

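   /* an ALU slot holds a vector instruction and a co-issued scalar one:
    * pick the best vector instruction first, then look for a compatible
    * scalar instruction to pair with it
    */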
   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try the more complex insertion of a vector instruction as a scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

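   /* emit one scheduling slot per iteration until sched_next() finds no
    * remaining instruction (it returns a negative block index)
    */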
   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_TEX_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
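   /* a vertex shader compiled without a fragment shader to link against
    * is the binning-pass variant
    */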
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}