/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

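/* instruction scheduling for the ir2 (adreno a2xx) backend: a scheduling slot
 * can hold a vector ALU instruction together with a co-issued scalar ALU
 * instruction, so sched_next() below picks and pairs instructions accordingly
 */

/* an instruction can use the scalar slot if it has a scalar opcode and reads
 * only a single source component */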
static bool scalar_possible(struct ir2_instr *instr)
{
	if (instr->alu.scalar_opc == SCALAR_NONE)
		return false;

	return src_ncomp(instr) == 1;
}

static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
	if (!a)
		return true;

	/* don't use the same instruction twice */
	if (a == b)
		return false;

	/* PRED_SET must be alone */
	if (b->alu.scalar_opc >= PRED_SETEs &&
		b->alu.scalar_opc <= PRED_SET_RESTOREs)
		return false;

	/* must write to the same export (issues otherwise?) */
	return a->alu.export == b->alu.export;
}

/* priority of a vector instruction for scheduling (lower = higher priority) */
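/* the resulting order: 0 = three-source instructions, 1 = instructions that
 * can't be scalarized, 2 = two sources, 3 = one source, 4 = exports,
 * ~0 = no vector opcode
 */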
static unsigned alu_vector_prio(struct ir2_instr *instr)
{
	if (instr->alu.vector_opc == VECTOR_NONE)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* TODO check src type and ncomps */
	if (instr->src_count == 3)
		return 0;

	if (!scalar_possible(instr))
		return 1;

	return instr->src_count == 2 ? 2 : 3;
}

/* priority of a scalar instruction for scheduling (lower = higher priority) */
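/* the resulting order: 0 = scalar-only instructions, 3 = instructions that
 * also have a vector opcode, 4 = exports, 5 = PRED_SET* (kept at the end of
 * the block), ~0 = can't go in the scalar slot here (multi-source scalars
 * are handled separately)
 */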
static unsigned alu_scalar_prio(struct ir2_instr *instr)
{
	if (!scalar_possible(instr))
		return ~0u;

	/* this case is dealt with later */
	if (instr->src_count > 1)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* PRED_SET goes to the end of the block */
	if (instr->alu.scalar_opc >= PRED_SETEs &&
		instr->alu.scalar_opc <= PRED_SET_RESTOREs)
		return 5;

	/* scalar-only instructions have the highest priority */
	return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr*
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
	struct ir2_src src1, unsigned *comp)
{
	struct ir2_sched_instr *sched = NULL, *s;
	unsigned i, mask = 0xf;

	/* scan backwards for the earliest point where the mov can be inserted */
	for (i = ctx->instr_sched_count-1; i > 0; i--) {
		s = &ctx->instr_sched[i - 1];

		if (s->instr && s->instr->block_idx != block_idx)
			break;
		if (s->instr_s && s->instr_s->block_idx != block_idx)
			break;

		if (src1.type == IR2_SRC_SSA) {
			if ((s->instr && s->instr->idx == src1.num) ||
				(s->instr_s && s->instr_s->idx == src1.num))
				break;
		}

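		/* each reg_state word packs a 4-bit component mask per register
		 * (8 registers per word); mr is the set of components of reg_idx
		 * still unused in this slot
		 */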
		unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
		if ((mask & mr) == 0)
			break;

		mask &= mr;
		if (s->instr_s || s->instr->src_count == 3)
			continue;

		if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
			continue;

		sched = s;
	}
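	/* use the lowest component that stayed free over the scanned range, and
	 * mark it as used from the insertion point to the end of the schedule
	 */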
	*comp = ffs(mask) - 1;

	if (sched) {
		for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
			s->reg_state[reg_idx/8] |= 1 << (*comp+reg_idx%8*4);
	}

	return sched;
}

/* case1:
 * insert a mov so that the 2nd src is placed into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is an input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
	struct ir2_src src0 = instr->src[ order];
	struct ir2_src src1 = instr->src[!order];
	struct ir2_sched_instr *sched;
	struct ir2_instr *ins;
	struct ir2_reg *reg;
	unsigned idx, comp;

	switch (src0.type) {
	case IR2_SRC_CONST:
	case IR2_SRC_INPUT:
		return false;
	default:
		break;
	}

	/* TODO: insert() needs logic for this */
	if (src1.type == IR2_SRC_REG)
		return false;

	/* we could do something if they match src1.. */
	if (src0.negate || src0.abs)
		return false;

	reg = get_reg_src(ctx, &src0);

	/* the result must not be used elsewhere, since we will overwrite it */
	for (int i = 0; i < 4; i++)
		if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
			return false;

	/* find a place to insert the mov */
	sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
	if (!sched)
		return false;

	ins = &ctx->instr[idx = ctx->instr_count++];
	ins->idx = idx;
	ins->type = IR2_ALU;
	ins->src[0] = src1;
	ins->src_count = 1;
	ins->is_ssa = true;
	ins->ssa.idx = reg->idx;
	ins->ssa.ncomp = 1;
	ins->ssa.comp[0].c = comp;
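	/* a single-source MAXs serves as the scalar mov here */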
	ins->alu.scalar_opc = MAXs;
	ins->alu.export = -1;
	ins->alu.write_mask = 1;
	ins->pred = instr->pred;
	ins->block_idx = instr->block_idx;

	instr->src[0] = src0;
	instr->alu.src1_swizzle = comp;

	sched->instr_s = ins;
	return true;
}

/* fill sched with the next fetch or (vector and/or scalar) ALU instruction */
static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
	struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
	unsigned avail_count = 0;

	instr_alloc_type_t export = ~0u;
	int block_idx = -1;

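	/* first pass: find the lowest export buffer among the instructions still
	 * to be emitted, so that exports to different buffers don't get mixed in
	 * one scheduling group
	 */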
	/* XXX merge this loop with the other one somehow? */
	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;
		if (is_export(instr))
			export = MIN2(export, export_buf(instr->alu.export));
	}

	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;

		/* don't mix exports */
		if (is_export(instr) && export_buf(instr->alu.export) != export)
			continue;

		if (block_idx < 0)
			block_idx = instr->block_idx;
		else if (block_idx != instr->block_idx || /* must be same block */
			instr->type == IR2_CF || /* CF/MEM must be alone */
			(is_export(instr) && export == SQ_MEMORY))
			break;
		/* this works because IR2_CF is always at the end of a block,
		 * and roughly the same idea applies to MEM exports, which might
		 * not be alone but will at least end up in order
		 */

		/* check if dependencies are satisfied */
		bool is_ok = true;
		ir2_foreach_src(src, instr) {
			if (src->type == IR2_SRC_REG) {
				/* need to check that all previous instructions in the block
				 * which write the reg have been emitted
				 * slow..
				 * XXX: check components instead of the whole register
				 */
				struct ir2_reg *reg = get_reg_src(ctx, src);
				ir2_foreach_instr(p, ctx) {
					if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
						is_ok &= !p->need_emit;
				}
			} else if (src->type == IR2_SRC_SSA) {
				/* in this case it's easy, just check need_emit */
				is_ok &= !ctx->instr[src->num].need_emit;
			}
		}
		/* don't reorder a non-SSA write before its reads */
		if (!instr->is_ssa) {
			ir2_foreach_instr(p, ctx) {
				if (!p->need_emit || p->idx >= instr->idx)
					continue;

				ir2_foreach_src(src, p) {
					if (get_reg_src(ctx, src) == instr->reg)
						is_ok = false;
				}
			}
		}
		/* don't reorder across predicates */
		if (avail_count && instr->pred != avail[0]->pred)
			is_ok = false;

		if (!is_ok)
			continue;

		avail[avail_count++] = instr;
	}

	if (!avail_count) {
		assert(block_idx == -1);
		return -1;
	}

	/* FETCH instructions get priority */
	ir2_foreach_avail(instr) {
		if (instr->type == IR2_ALU)
			continue;

		ra_src_free(ctx, instr);
		ra_reg(ctx, get_reg(instr), -1, false, 0);

		instr->need_emit = false;
		sched->instr = instr;
		sched->instr_s = NULL;
		return block_idx;
	}

	/* TODO precompute priorities */

	unsigned prio_v = ~0u, prio_s = ~0u, prio;
	ir2_foreach_avail(instr) {
		prio = alu_vector_prio(instr);
		if (prio < prio_v) {
			instr_v = instr;
			prio_v = prio;
		}
	}

	/* TODO: a scalar could still be inserted when src_count == 3, if smart about it */
	if (!instr_v || instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			bool compat = is_alu_compatible(instr_v, instr);

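			/* a scalar candidate that can't pair with instr_v is only worth
			 * considering if it beats instr_v's priority; in that case
			 * instr_v is dropped and the scalar runs in this slot alone
			 */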
			prio = alu_scalar_prio(instr);
			if (prio >= prio_v && !compat)
				continue;

			if (prio < prio_s) {
				instr_s = instr;
				prio_s = prio;
				if (!compat)
					instr_v = NULL;
			}
		}
	}

	assert(instr_v || instr_s);

	/* now try the more complex case: inserting a vector instruction as a scalar
	 * TODO: if we are smart we can still insert if instr_v->src_count == 3
	 */
	if (!instr_s && instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
				continue;

			/* at this point, src_count should always be 2 */
			assert(instr->src_count == 2);

			if (scalarize_case1(ctx, instr, 0)) {
				instr_s = instr;
				break;
			}
			if (scalarize_case1(ctx, instr, 1)) {
				instr_s = instr;
				break;
			}
		}
	}

	/* free src registers */
	if (instr_v) {
		instr_v->need_emit = false;
		ra_src_free(ctx, instr_v);
	}

	if (instr_s) {
		instr_s->need_emit = false;
		ra_src_free(ctx, instr_s);
	}

	/* allocate dst registers */
	if (instr_v)
		ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);

	if (instr_s)
		ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);

	sched->instr = instr_v;
	sched->instr_s = instr_s;
	return block_idx;
}

/* scheduling: determine the order of instructions */
static void schedule_instrs(struct ir2_context *ctx)
{
	struct ir2_sched_instr *sched;
	int block_idx;

	/* allocate input registers */
	for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
		if (ctx->input[idx].initialized)
			ra_reg(ctx, &ctx->input[idx], idx, false, 0);

	for (;;) {
		sched = &ctx->instr_sched[ctx->instr_sched_count++];
		block_idx = sched_next(ctx, sched);
		if (block_idx < 0)
			break;
		memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

		/* catch a texture fetch after scheduling and insert the
		 * SET_TEX_LOD right before it if necessary
		 * TODO clean this up
		 */
		struct ir2_instr *instr = sched->instr, *tex_lod;
		if (instr && instr->type == IR2_FETCH &&
			instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
			/* generate the SET_LOD instruction */
			tex_lod = &ctx->instr[ctx->instr_count++];
			tex_lod->type = IR2_FETCH;
			tex_lod->block_idx = instr->block_idx;
			tex_lod->pred = instr->pred;
			tex_lod->fetch.opc = TEX_SET_TEX_LOD;
			tex_lod->src[0] = instr->src[1];
			tex_lod->src_count = 1;

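			/* shift the fetch into the next slot and put SET_TEX_LOD in
			 * its place, so the LOD is set right before the fetch
			 */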
			sched[1] = sched[0];
			sched->instr = tex_lod;
			ctx->instr_sched_count++;
		}

		bool free_block = true;
		ir2_foreach_instr(instr, ctx)
			free_block &= instr->block_idx != block_idx;
		if (free_block)
			ra_block_free(ctx, block_idx);
	}
	ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
		struct fd2_shader_stateobj *fp)
{
	struct ir2_context ctx = { };
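	/* a vertex shader compiled with no fragment shader to link against is
	 * the binning pass variant
	 */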
	bool binning = !fp && so->type == MESA_SHADER_VERTEX;

	if (fp)
		so->variant[variant].f = fp->variant[0].f;

	ctx.so = so;
	ctx.info = &so->variant[variant].info;
	ctx.f = &so->variant[variant].f;
	ctx.info->max_reg = -1;

	/* convert nir to the internal representation */
	ir2_nir_compile(&ctx, binning);

	/* copy propagate srcs */
	cp_src(&ctx);

	/* get ref_counts and kill unneeded instructions */
	ra_count_refs(&ctx);

	/* remove movs used to write outputs */
	cp_export(&ctx);

	/* determine instruction order, and do vector->scalar conversions */
	schedule_instrs(&ctx);

	/* finally, assemble to bitcode */
	assemble(&ctx, binning);
}