• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28 
29 #include "sb/sb_public.h"
30 
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41 
42 /* CAYMAN notes
43 The notes below explain why many instructions are emitted in per-slot loops on CAYMAN.
44 
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62 
63 /* Contents of r0 on entry to various shaders
64 
65  VS - .x = VertexID
66       .y = RelVertexID (??)
67       .w = InstanceID
68 
69  GS - r0.xyw, r1.xyz = per-vertex offsets
70       r0.z = PrimitiveID
71 
72  TCS - .x = PatchID
73        .y = RelPatchID (??)
74        .z = InvocationID
75        .w = tess factor base.
76 
77  TES - .x = TessCoord.x
78      - .y = TessCoord.y
79      - .z = RelPatchID (??)
80      - .w = PrimitiveID
81 
82  PS - face_gpr.z = SampleMask
83       face_gpr.w = SampleID
84 */
/* NOTE(review): presumably the ALU source selector of the driver constant
 * that holds per-buffer info; the 512 base and the /16 scaling are not
 * explained in the visible code -- confirm against the constant layout. */
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
/* Forward declaration: translates the TGSI tokens of a shader variant into
 * r600 bytecode. Called by r600_pipe_shader_create(); a non-zero return is
 * treated there as failure. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);
89 
r600_add_gpr_array(struct r600_shader * ps,int start_gpr,int size,unsigned comp_mask)90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91                            int size, unsigned comp_mask) {
92 
93 	if (!size)
94 		return;
95 
96 	if (ps->num_arrays == ps->max_arrays) {
97 		ps->max_arrays += 64;
98 		ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 		                     sizeof(struct r600_shader_array));
100 	}
101 
102 	int n = ps->num_arrays;
103 	++ps->num_arrays;
104 
105 	ps->arrays[n].comp_mask = comp_mask;
106 	ps->arrays[n].gpr_start = start_gpr;
107 	ps->arrays[n].gpr_count = size;
108 }
109 
r600_dump_streamout(struct pipe_stream_output_info * so)110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 	unsigned i;
113 
114 	fprintf(stderr, "STREAMOUT\n");
115 	for (i = 0; i < so->num_outputs; i++) {
116 		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 				so->output[i].start_component;
118 		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 			i,
120 			so->output[i].stream,
121 			so->output[i].output_buffer,
122 			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 			so->output[i].register_index,
124 			mask & 1 ? "x" : "",
125 		        mask & 2 ? "y" : "",
126 		        mask & 4 ? "z" : "",
127 		        mask & 8 ? "w" : "",
128 			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 	}
130 }
131 
store_shader(struct pipe_context * ctx,struct r600_pipe_shader * shader)132 static int store_shader(struct pipe_context *ctx,
133 			struct r600_pipe_shader *shader)
134 {
135 	struct r600_context *rctx = (struct r600_context *)ctx;
136 	uint32_t *ptr, i;
137 
138 	if (shader->bo == NULL) {
139 		shader->bo = (struct r600_resource*)
140 			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 		if (shader->bo == NULL) {
142 			return -ENOMEM;
143 		}
144 		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 		if (R600_BIG_ENDIAN) {
146 			for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 			}
149 		} else {
150 			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 		}
152 		rctx->b.ws->buffer_unmap(shader->bo->buf);
153 	}
154 
155 	return 0;
156 }
157 
/**
 * Compile a shader variant and upload it.
 *
 * Steps, in order: optionally dump the TGSI tokens (and the stream-output
 * layout), translate TGSI into r600 bytecode, build the bytecode if it has
 * not been built yet, optionally disassemble it and/or run it through the
 * SB pass, upload the bytecode (and that of the GS copy shader, if one
 * exists) with store_shader(), and finally build the per-stage state.
 *
 * @param ctx     pipe context; cast to struct r600_context internally
 * @param shader  variant to fill in; shader->selector supplies the TGSI
 *                tokens and the stream-output info
 * @param key     variant key; key.vs.as_ls, key.vs.as_es and key.tes.as_es
 *                select which stage state is built
 * @return 0 on success, otherwise the non-zero error code of the failing
 *         step (-EINVAL for an unhandled processor type). On failure
 *         r600_pipe_shader_destroy() is called on @shader before returning.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	/* Note: computed from the initial use_sb, i.e. before the per-stage
	 * and doubles checks below may clear use_sb. */
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* SB is never used for tessellation control/evaluation shaders. */
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Either print the plain disassembly, or hand the bytecode to SB
	 * (which receives both the dump and the use_sb flag). */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader gets VS state alongside the GS itself. */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	/* Common failure path: release whatever was set up for this variant. */
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
289 
/**
 * Release what a shader variant holds: drop the reference on its buffer
 * object (shader->bo), clear its bytecode and release its command buffer.
 * @ctx is not used here. Also used as the error path of
 * r600_pipe_shader_create().
 */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
296 
297 /*
298  * tgsi -> r600 shader
299  */
300 struct r600_shader_tgsi_instruction;
301 
/* Description of one source operand while an instruction is translated.
 * r600_shader_ctx keeps four of these (ctx->src[4]); r600_bytecode_src()
 * turns one of them, for a given channel, into an ALU source.
 * NOTE(review): the per-field notes below are inferred from the field names
 * only -- confirm against the code that fills the structure. */
struct r600_shader_src {
	unsigned				sel;        /* presumably the register/constant selector */
	unsigned				swizzle[4]; /* presumably one source channel per destination channel */
	unsigned				neg;        /* presumably: negate the source */
	unsigned				abs;        /* presumably: take the absolute value */
	unsigned				rel;        /* presumably: relative (indexed) addressing */
	unsigned				kc_bank;    /* presumably the constant cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];   /* presumably literal values, one per channel */
};
312 
/* One evergreen interpolator slot. r600_shader_ctx holds six of them,
 * indexed by the value eg_get_interpolator_index() returns. */
struct eg_interp {
	boolean					enabled;
	/* Copied into the shader input's ij_index; evergreen_interp_alu()
	 * derives the source GPR (ij_index / 2) and channel pair from it. */
	unsigned				ij_index;
};
317 
/* Working state for translating one TGSI shader into r600 bytecode.
 * A pointer to it is passed to every per-opcode handler
 * (r600_shader_tgsi_instruction::process). */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type; /* compared against PIPE_SHADER_* values */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* added to TGSI register indices to form GPR numbers */
	unsigned				temp_reg; /* base register for r600_get_temp() */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* number of temps handed out by r600_get_temp() */
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int					fragcoord_input;
	int					native_integers;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
};
354 
/* One entry of an opcode translation table: the hardware opcode (read by
 * handlers through ctx->inst_info->op) and the handler that emits the
 * bytecode. Judging by the handlers in this file, a handler returns 0 on
 * success and a non-zero error code otherwise. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
359 
/* Forward declarations. All of these are static, so their definitions have
 * to appear in this translation unit.
 * NOTE(review): the three tables are presumably the per-chip-family opcode
 * tables (r600 / evergreen / cayman) -- confirm at their definitions. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg);
/* Converts one channel of a r600_shader_src into an ALU source. */
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg);

378 
/**
 * Return the index (0..3) of the highest channel enabled in @writemask.
 * Only the low four bits are examined; 0 is returned when none is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan from w down to y; x (or an empty mask) falls out as 0. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
390 
tgsi_is_supported(struct r600_shader_ctx * ctx)391 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
392 {
393 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
394 	unsigned j;
395 
396 	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
397 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
398 		return -EINVAL;
399 	}
400 	if (i->Instruction.Predicate) {
401 		R600_ERR("predicate unsupported\n");
402 		return -EINVAL;
403 	}
404 #if 0
405 	if (i->Instruction.Label) {
406 		R600_ERR("label unsupported\n");
407 		return -EINVAL;
408 	}
409 #endif
410 	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
411 		if (i->Src[j].Register.Dimension) {
412 		   switch (i->Src[j].Register.File) {
413 		   case TGSI_FILE_CONSTANT:
414 			   break;
415 		   case TGSI_FILE_INPUT:
416 			   if (ctx->type == PIPE_SHADER_GEOMETRY ||
417 			       ctx->type == PIPE_SHADER_TESS_CTRL ||
418 			       ctx->type == PIPE_SHADER_TESS_EVAL)
419 				   break;
420 		   case TGSI_FILE_OUTPUT:
421 			   if (ctx->type == PIPE_SHADER_TESS_CTRL)
422 				   break;
423 		   default:
424 			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
425 				    i->Src[j].Register.File,
426 				    i->Src[j].Register.Dimension);
427 			   return -EINVAL;
428 		   }
429 		}
430 	}
431 	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
432 		if (i->Dst[j].Register.Dimension) {
433 			if (ctx->type == PIPE_SHADER_TESS_CTRL)
434 				continue;
435 			R600_ERR("unsupported dst (dimension)\n");
436 			return -EINVAL;
437 		}
438 	}
439 	return 0;
440 }
441 
eg_get_interpolator_index(unsigned interpolate,unsigned location)442 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
443 {
444 	if (interpolate == TGSI_INTERPOLATE_COLOR ||
445 		interpolate == TGSI_INTERPOLATE_LINEAR ||
446 		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
447 	{
448 		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
449 		int loc;
450 
451 		switch(location) {
452 		case TGSI_INTERPOLATE_LOC_CENTER:
453 			loc = 1;
454 			break;
455 		case TGSI_INTERPOLATE_LOC_CENTROID:
456 			loc = 2;
457 			break;
458 		case TGSI_INTERPOLATE_LOC_SAMPLE:
459 		default:
460 			loc = 0; break;
461 		}
462 
463 		return is_linear * 3 + loc;
464 	}
465 
466 	return -1;
467 }
468 
evergreen_interp_assign_ij_index(struct r600_shader_ctx * ctx,int input)469 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
470 		int input)
471 {
472 	int i = eg_get_interpolator_index(
473 		ctx->shader->input[input].interpolate,
474 		ctx->shader->input[input].interpolate_location);
475 	assert(i >= 0);
476 	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
477 }
478 
evergreen_interp_alu(struct r600_shader_ctx * ctx,int input)479 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
480 {
481 	int i, r;
482 	struct r600_bytecode_alu alu;
483 	int gpr = 0, base_chan = 0;
484 	int ij_index = ctx->shader->input[input].ij_index;
485 
486 	/* work out gpr and base_chan from index */
487 	gpr = ij_index / 2;
488 	base_chan = (2 * (ij_index % 2)) + 1;
489 
490 	for (i = 0; i < 8; i++) {
491 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
492 
493 		if (i < 4)
494 			alu.op = ALU_OP2_INTERP_ZW;
495 		else
496 			alu.op = ALU_OP2_INTERP_XY;
497 
498 		if ((i > 1) && (i < 6)) {
499 			alu.dst.sel = ctx->shader->input[input].gpr;
500 			alu.dst.write = 1;
501 		}
502 
503 		alu.dst.chan = i % 4;
504 
505 		alu.src[0].sel = gpr;
506 		alu.src[0].chan = (base_chan - (i % 2));
507 
508 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
509 
510 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
511 		if ((i % 4) == 3)
512 			alu.last = 1;
513 		r = r600_bytecode_add_alu(ctx->bc, &alu);
514 		if (r)
515 			return r;
516 	}
517 	return 0;
518 }
519 
evergreen_interp_flat(struct r600_shader_ctx * ctx,int input)520 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
521 {
522 	int i, r;
523 	struct r600_bytecode_alu alu;
524 
525 	for (i = 0; i < 4; i++) {
526 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
527 
528 		alu.op = ALU_OP1_INTERP_LOAD_P0;
529 
530 		alu.dst.sel = ctx->shader->input[input].gpr;
531 		alu.dst.write = 1;
532 
533 		alu.dst.chan = i;
534 
535 		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
536 		alu.src[0].chan = i;
537 
538 		if (i == 3)
539 			alu.last = 1;
540 		r = r600_bytecode_add_alu(ctx->bc, &alu);
541 		if (r)
542 			return r;
543 	}
544 	return 0;
545 }
546 
547 /*
548  * Special export handling in shaders
549  *
550  * shader export ARRAY_BASE for EXPORT_POS:
551  * 60 is position
552  * 61 is misc vector
553  * 62, 63 are clip distance vectors
554  *
555  * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
556  * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
557  * USE_VTX_POINT_SIZE - point size in the X channel of export 61
558  * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
559  * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
560  * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
561  * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
562  * exclusive from render target index)
563  * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
564  *
565  *
566  * shader export ARRAY_BASE for EXPORT_PIXEL:
567  * 0-7 CB targets
568  * 61 computed Z vector
569  *
570  * The use of the values exported in the computed Z vector is controlled
571  * by DB_SHADER_CONTROL:
572  * Z_EXPORT_ENABLE - Z as a float in RED
573  * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
574  * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
575  * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
576  * DB_SOURCE_FORMAT - export control restrictions
577  *
578  */
579 
580 
581 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
r600_spi_sid(struct r600_shader_io * io)582 static int r600_spi_sid(struct r600_shader_io * io)
583 {
584 	int index, name = io->name;
585 
586 	/* These params are handled differently, they don't need
587 	 * semantic indices, so we'll use 0 for them.
588 	 */
589 	if (name == TGSI_SEMANTIC_POSITION ||
590 	    name == TGSI_SEMANTIC_PSIZE ||
591 	    name == TGSI_SEMANTIC_EDGEFLAG ||
592 	    name == TGSI_SEMANTIC_FACE ||
593 	    name == TGSI_SEMANTIC_SAMPLEMASK)
594 		index = 0;
595 	else {
596 		if (name == TGSI_SEMANTIC_GENERIC) {
597 			/* For generic params simply use sid from tgsi */
598 			index = io->sid;
599 		} else {
600 			/* For non-generic params - pack name and sid into 8 bits */
601 			index = 0x80 | (name<<3) | (io->sid);
602 		}
603 
604 		/* Make sure that all really used indices have nonzero value, so
605 		 * we can just compare it to 0 later instead of comparing the name
606 		 * with different values to detect special cases. */
607 		index++;
608 	}
609 
610 	return index;
611 };
612 
613 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
r600_get_lds_unique_index(unsigned semantic_name,unsigned index)614 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
615 {
616 	switch (semantic_name) {
617 	case TGSI_SEMANTIC_POSITION:
618 		return 0;
619 	case TGSI_SEMANTIC_PSIZE:
620 		return 1;
621 	case TGSI_SEMANTIC_CLIPDIST:
622 		assert(index <= 1);
623 		return 2 + index;
624 	case TGSI_SEMANTIC_GENERIC:
625 		if (index <= 63-4)
626 			return 4 + index - 9;
627 		else
628 			/* same explanation as in the default statement,
629 			 * the only user hitting this is st/nine.
630 			 */
631 			return 0;
632 
633 	/* patch indices are completely separate and thus start from 0 */
634 	case TGSI_SEMANTIC_TESSOUTER:
635 		return 0;
636 	case TGSI_SEMANTIC_TESSINNER:
637 		return 1;
638 	case TGSI_SEMANTIC_PATCH:
639 		return 2 + index;
640 
641 	default:
642 		/* Don't fail here. The result of this function is only used
643 		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
644 		 * occur, but this function is called for all vertex shaders
645 		 * before it's known whether LS will be compiled or not.
646 		 */
647 		return 0;
648 	}
649 }
650 
651 /* turn input into interpolate on EG */
evergreen_interp_input(struct r600_shader_ctx * ctx,int index)652 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
653 {
654 	int r = 0;
655 
656 	if (ctx->shader->input[index].spi_sid) {
657 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
658 		if (ctx->shader->input[index].interpolate > 0) {
659 			evergreen_interp_assign_ij_index(ctx, index);
660 			r = evergreen_interp_alu(ctx, index);
661 		} else {
662 			r = evergreen_interp_flat(ctx, index);
663 		}
664 	}
665 	return r;
666 }
667 
select_twoside_color(struct r600_shader_ctx * ctx,int front,int back)668 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
669 {
670 	struct r600_bytecode_alu alu;
671 	int i, r;
672 	int gpr_front = ctx->shader->input[front].gpr;
673 	int gpr_back = ctx->shader->input[back].gpr;
674 
675 	for (i = 0; i < 4; i++) {
676 		memset(&alu, 0, sizeof(alu));
677 		alu.op = ALU_OP3_CNDGT;
678 		alu.is_op3 = 1;
679 		alu.dst.write = 1;
680 		alu.dst.sel = gpr_front;
681 		alu.src[0].sel = ctx->face_gpr;
682 		alu.src[1].sel = gpr_front;
683 		alu.src[2].sel = gpr_back;
684 
685 		alu.dst.chan = i;
686 		alu.src[1].chan = i;
687 		alu.src[2].chan = i;
688 		alu.last = (i==3);
689 
690 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
691 			return r;
692 	}
693 
694 	return 0;
695 }
696 
697 /* execute a single slot ALU calculation */
single_alu_op2(struct r600_shader_ctx * ctx,int op,int dst_sel,int dst_chan,int src0_sel,unsigned src0_chan_val,int src1_sel,unsigned src1_chan_val)698 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
699 			  int dst_sel, int dst_chan,
700 			  int src0_sel, unsigned src0_chan_val,
701 			  int src1_sel, unsigned src1_chan_val)
702 {
703 	struct r600_bytecode_alu alu;
704 	int r, i;
705 
706 	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
707 		for (i = 0; i < 4; i++) {
708 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
709 			alu.op = op;
710 			alu.src[0].sel = src0_sel;
711 			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
712 				alu.src[0].value = src0_chan_val;
713 			else
714 				alu.src[0].chan = src0_chan_val;
715 			alu.src[1].sel = src1_sel;
716 			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
717 				alu.src[1].value = src1_chan_val;
718 			else
719 				alu.src[1].chan = src1_chan_val;
720 			alu.dst.sel = dst_sel;
721 			alu.dst.chan = i;
722 			alu.dst.write = i == dst_chan;
723 			alu.last = (i == 3);
724 			r = r600_bytecode_add_alu(ctx->bc, &alu);
725 			if (r)
726 				return r;
727 		}
728 		return 0;
729 	}
730 
731 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
732 	alu.op = op;
733 	alu.src[0].sel = src0_sel;
734 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
735 		alu.src[0].value = src0_chan_val;
736 	else
737 		alu.src[0].chan = src0_chan_val;
738 	alu.src[1].sel = src1_sel;
739 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
740 		alu.src[1].value = src1_chan_val;
741 	else
742 		alu.src[1].chan = src1_chan_val;
743 	alu.dst.sel = dst_sel;
744 	alu.dst.chan = dst_chan;
745 	alu.dst.write = 1;
746 	alu.last = 1;
747 	r = r600_bytecode_add_alu(ctx->bc, &alu);
748 	if (r)
749 		return r;
750 	return 0;
751 }
752 
753 /* execute a single slot ALU calculation */
single_alu_op3(struct r600_shader_ctx * ctx,int op,int dst_sel,int dst_chan,int src0_sel,unsigned src0_chan_val,int src1_sel,unsigned src1_chan_val,int src2_sel,unsigned src2_chan_val)754 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
755 			  int dst_sel, int dst_chan,
756 			  int src0_sel, unsigned src0_chan_val,
757 			  int src1_sel, unsigned src1_chan_val,
758 			  int src2_sel, unsigned src2_chan_val)
759 {
760 	struct r600_bytecode_alu alu;
761 	int r;
762 
763 	/* validate this for other ops */
764 	assert(op == ALU_OP3_MULADD_UINT24);
765 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
766 	alu.op = op;
767 	alu.src[0].sel = src0_sel;
768 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
769 		alu.src[0].value = src0_chan_val;
770 	else
771 		alu.src[0].chan = src0_chan_val;
772 	alu.src[1].sel = src1_sel;
773 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
774 		alu.src[1].value = src1_chan_val;
775 	else
776 		alu.src[1].chan = src1_chan_val;
777 	alu.src[2].sel = src2_sel;
778 	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
779 		alu.src[2].value = src2_chan_val;
780 	else
781 		alu.src[2].chan = src2_chan_val;
782 	alu.dst.sel = dst_sel;
783 	alu.dst.chan = dst_chan;
784 	alu.is_op3 = 1;
785 	alu.last = 1;
786 	r = r600_bytecode_add_alu(ctx->bc, &alu);
787 	if (r)
788 		return r;
789 	return 0;
790 }
791 
792 /* put it in temp_reg.x */
get_lds_offset0(struct r600_shader_ctx * ctx,int rel_patch_chan,int temp_reg,bool is_patch_var)793 static int get_lds_offset0(struct r600_shader_ctx *ctx,
794 			   int rel_patch_chan,
795 			   int temp_reg, bool is_patch_var)
796 {
797 	int r;
798 
799 	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
800 	/* ADD
801 	   Dimension - patch0_offset (input_vals.z),
802 	   Non-dim - patch0_data_offset (input_vals.w)
803 	*/
804 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
805 			   temp_reg, 0,
806 			   ctx->tess_output_info, 0,
807 			   0, rel_patch_chan,
808 			   ctx->tess_output_info, is_patch_var ? 3 : 2);
809 	if (r)
810 		return r;
811 	return 0;
812 }
813 
get_address_file_reg(struct r600_shader_ctx * ctx,int index)814 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
815 {
816 	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
817 }
818 
r600_get_temp(struct r600_shader_ctx * ctx)819 static int r600_get_temp(struct r600_shader_ctx *ctx)
820 {
821 	return ctx->temp_reg + ctx->max_driver_temp_used++;
822 }
823 
vs_add_primid_output(struct r600_shader_ctx * ctx,int prim_id_sid)824 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
825 {
826 	int i;
827 	i = ctx->shader->noutput++;
828 	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
829 	ctx->shader->output[i].sid = 0;
830 	ctx->shader->output[i].gpr = 0;
831 	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
832 	ctx->shader->output[i].write_mask = 0x4;
833 	ctx->shader->output[i].spi_sid = prim_id_sid;
834 
835 	return 0;
836 }
837 
tgsi_barrier(struct r600_shader_ctx * ctx)838 static int tgsi_barrier(struct r600_shader_ctx *ctx)
839 {
840 	struct r600_bytecode_alu alu;
841 	int r;
842 
843 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
844 	alu.op = ctx->inst_info->op;
845 	alu.last = 1;
846 
847 	r = r600_bytecode_add_alu(ctx->bc, &alu);
848 	if (r)
849 		return r;
850 	return 0;
851 }
852 
tgsi_declaration(struct r600_shader_ctx * ctx)853 static int tgsi_declaration(struct r600_shader_ctx *ctx)
854 {
855 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
856 	int r, i, j, count = d->Range.Last - d->Range.First + 1;
857 
858 	switch (d->Declaration.File) {
859 	case TGSI_FILE_INPUT:
860 		for (j = 0; j < count; j++) {
861 			i = ctx->shader->ninput + j;
862 			assert(i < ARRAY_SIZE(ctx->shader->input));
863 			ctx->shader->input[i].name = d->Semantic.Name;
864 			ctx->shader->input[i].sid = d->Semantic.Index + j;
865 			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
866 			ctx->shader->input[i].interpolate_location = d->Interp.Location;
867 			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
868 			if (ctx->type == PIPE_SHADER_FRAGMENT) {
869 				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
870 				switch (ctx->shader->input[i].name) {
871 				case TGSI_SEMANTIC_FACE:
872 					if (ctx->face_gpr != -1)
873 						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
874 					else
875 						ctx->face_gpr = ctx->shader->input[i].gpr;
876 					break;
877 				case TGSI_SEMANTIC_COLOR:
878 					ctx->colors_used++;
879 					break;
880 				case TGSI_SEMANTIC_POSITION:
881 					ctx->fragcoord_input = i;
882 					break;
883 				case TGSI_SEMANTIC_PRIMID:
884 					/* set this for now */
885 					ctx->shader->gs_prim_id_input = true;
886 					ctx->shader->ps_prim_id_input = i;
887 					break;
888 				}
889 				if (ctx->bc->chip_class >= EVERGREEN) {
890 					if ((r = evergreen_interp_input(ctx, i)))
891 						return r;
892 				}
893 			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
894 				/* FIXME probably skip inputs if they aren't passed in the ring */
895 				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
896 				ctx->next_ring_offset += 16;
897 				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
898 					ctx->shader->gs_prim_id_input = true;
899 			}
900 		}
901 		ctx->shader->ninput += count;
902 		break;
903 	case TGSI_FILE_OUTPUT:
904 		for (j = 0; j < count; j++) {
905 			i = ctx->shader->noutput + j;
906 			assert(i < ARRAY_SIZE(ctx->shader->output));
907 			ctx->shader->output[i].name = d->Semantic.Name;
908 			ctx->shader->output[i].sid = d->Semantic.Index + j;
909 			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
910 			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
911 			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
912 			if (ctx->type == PIPE_SHADER_VERTEX ||
913 			    ctx->type == PIPE_SHADER_GEOMETRY ||
914 			    ctx->type == PIPE_SHADER_TESS_EVAL) {
915 				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
916 				switch (d->Semantic.Name) {
917 				case TGSI_SEMANTIC_CLIPDIST:
918 					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
919 									((d->Semantic.Index + j) << 2);
920 					break;
921 				case TGSI_SEMANTIC_PSIZE:
922 					ctx->shader->vs_out_misc_write = 1;
923 					ctx->shader->vs_out_point_size = 1;
924 					break;
925 				case TGSI_SEMANTIC_EDGEFLAG:
926 					ctx->shader->vs_out_misc_write = 1;
927 					ctx->shader->vs_out_edgeflag = 1;
928 					ctx->edgeflag_output = i;
929 					break;
930 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
931 					ctx->shader->vs_out_misc_write = 1;
932 					ctx->shader->vs_out_viewport = 1;
933 					break;
934 				case TGSI_SEMANTIC_LAYER:
935 					ctx->shader->vs_out_misc_write = 1;
936 					ctx->shader->vs_out_layer = 1;
937 					break;
938 				case TGSI_SEMANTIC_CLIPVERTEX:
939 					ctx->clip_vertex_write = TRUE;
940 					ctx->cv_output = i;
941 					break;
942 				}
943 				if (ctx->type == PIPE_SHADER_GEOMETRY) {
944 					ctx->gs_out_ring_offset += 16;
945 				}
946 			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
947 				switch (d->Semantic.Name) {
948 				case TGSI_SEMANTIC_COLOR:
949 					ctx->shader->nr_ps_max_color_exports++;
950 					break;
951 				}
952 			}
953 		}
954 		ctx->shader->noutput += count;
955 		break;
956 	case TGSI_FILE_TEMPORARY:
957 		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
958 			if (d->Array.ArrayID) {
959 				r600_add_gpr_array(ctx->shader,
960 				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
961 								   d->Range.First,
962 				               d->Range.Last - d->Range.First + 1, 0x0F);
963 			}
964 		}
965 		break;
966 
967 	case TGSI_FILE_CONSTANT:
968 	case TGSI_FILE_SAMPLER:
969 	case TGSI_FILE_SAMPLER_VIEW:
970 	case TGSI_FILE_ADDRESS:
971 		break;
972 
973 	case TGSI_FILE_SYSTEM_VALUE:
974 		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
975 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
976 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
977 			break; /* Already handled from allocate_system_value_inputs */
978 		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
979 			if (!ctx->native_integers) {
980 				struct r600_bytecode_alu alu;
981 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
982 
983 				alu.op = ALU_OP1_INT_TO_FLT;
984 				alu.src[0].sel = 0;
985 				alu.src[0].chan = 3;
986 
987 				alu.dst.sel = 0;
988 				alu.dst.chan = 3;
989 				alu.dst.write = 1;
990 				alu.last = 1;
991 
992 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
993 					return r;
994 			}
995 			break;
996 		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
997 			break;
998 		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
999 			break;
1000 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1001 			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1002 			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1003 			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1004 			unsigned temp_reg = r600_get_temp(ctx);
1005 
1006 			r = get_lds_offset0(ctx, 2, temp_reg, true);
1007 			if (r)
1008 				return r;
1009 
1010 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1011 					   temp_reg, 0,
1012 					   temp_reg, 0,
1013 					   V_SQ_ALU_SRC_LITERAL, param * 16);
1014 			if (r)
1015 				return r;
1016 
1017 			do_lds_fetch_values(ctx, temp_reg, dreg);
1018 		}
1019 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1020 			/* MOV r1.x, r0.x;
1021 			   MOV r1.y, r0.y;
1022 			*/
1023 			for (i = 0; i < 2; i++) {
1024 				struct r600_bytecode_alu alu;
1025 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1026 				alu.op = ALU_OP1_MOV;
1027 				alu.src[0].sel = 0;
1028 				alu.src[0].chan = 0 + i;
1029 				alu.dst.sel = 1;
1030 				alu.dst.chan = 0 + i;
1031 				alu.dst.write = 1;
1032 				alu.last = (i == 1) ? 1 : 0;
1033 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1034 					return r;
1035 			}
1036 			/* ADD r1.z, 1.0f, -r0.x */
1037 			struct r600_bytecode_alu alu;
1038 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1039 			alu.op = ALU_OP2_ADD;
1040 			alu.src[0].sel = V_SQ_ALU_SRC_1;
1041 			alu.src[1].sel = 1;
1042 			alu.src[1].chan = 0;
1043 			alu.src[1].neg = 1;
1044 			alu.dst.sel = 1;
1045 			alu.dst.chan = 2;
1046 			alu.dst.write = 1;
1047 			alu.last = 1;
1048 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1049 				return r;
1050 
1051 			/* ADD r1.z, r1.z, -r1.y */
1052 			alu.op = ALU_OP2_ADD;
1053 			alu.src[0].sel = 1;
1054 			alu.src[0].chan = 2;
1055 			alu.src[1].sel = 1;
1056 			alu.src[1].chan = 1;
1057 			alu.src[1].neg = 1;
1058 			alu.dst.sel = 1;
1059 			alu.dst.chan = 2;
1060 			alu.dst.write = 1;
1061 			alu.last = 1;
1062 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1063 				return r;
1064 			break;
1065 		}
1066 		break;
1067 	default:
1068 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1069 		return -EINVAL;
1070 	}
1071 	return 0;
1072 }
1073 
allocate_system_value_inputs(struct r600_shader_ctx * ctx,int gpr_offset)1074 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1075 {
1076 	struct tgsi_parse_context parse;
1077 	struct {
1078 		boolean enabled;
1079 		int *reg;
1080 		unsigned name, alternate_name;
1081 	} inputs[2] = {
1082 		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1083 
1084 		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1085 	};
1086 	int i, k, num_regs = 0;
1087 
1088 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1089 		return 0;
1090 	}
1091 
1092 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1093 	while (!tgsi_parse_end_of_tokens(&parse)) {
1094 		tgsi_parse_token(&parse);
1095 
1096 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1097 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1098 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1099 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1100 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1101 			{
1102 				int interpolate, location, k;
1103 
1104 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1105 					location = TGSI_INTERPOLATE_LOC_CENTER;
1106 					inputs[1].enabled = true; /* needs SAMPLEID */
1107 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1108 					location = TGSI_INTERPOLATE_LOC_CENTER;
1109 					/* Needs sample positions, currently those are always available */
1110 				} else {
1111 					location = TGSI_INTERPOLATE_LOC_CENTROID;
1112 				}
1113 
1114 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1115 				k = eg_get_interpolator_index(interpolate, location);
1116 				ctx->eg_interpolators[k].enabled = true;
1117 			}
1118 		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1119 			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1120 			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1121 				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1122 					if (d->Semantic.Name == inputs[k].name ||
1123 						d->Semantic.Name == inputs[k].alternate_name) {
1124 						inputs[k].enabled = true;
1125 					}
1126 				}
1127 			}
1128 		}
1129 	}
1130 
1131 	tgsi_parse_free(&parse);
1132 
1133 	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1134 		boolean enabled = inputs[i].enabled;
1135 		int *reg = inputs[i].reg;
1136 		unsigned name = inputs[i].name;
1137 
1138 		if (enabled) {
1139 			int gpr = gpr_offset + num_regs++;
1140 
1141 			// add to inputs, allocate a gpr
1142 			k = ctx->shader->ninput ++;
1143 			ctx->shader->input[k].name = name;
1144 			ctx->shader->input[k].sid = 0;
1145 			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1146 			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1147 			*reg = ctx->shader->input[k].gpr = gpr;
1148 		}
1149 	}
1150 
1151 	return gpr_offset + num_regs;
1152 }
1153 
1154 /*
1155  * for evergreen we need to scan the shader to find the number of GPRs we need to
1156  * reserve for interpolation and system values
1157  *
1158  * we need to know if we are going to emit
1159  * any sample or centroid inputs
1160  * if perspective and linear are required
1161 */
evergreen_gpr_count(struct r600_shader_ctx * ctx)1162 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1163 {
1164 	unsigned i;
1165 	int num_baryc;
1166 	struct tgsi_parse_context parse;
1167 
1168 	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1169 
1170 	for (i = 0; i < ctx->info.num_inputs; i++) {
1171 		int k;
1172 		/* skip position/face/mask/sampleid */
1173 		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1174 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1175 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1176 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1177 			continue;
1178 
1179 		k = eg_get_interpolator_index(
1180 			ctx->info.input_interpolate[i],
1181 			ctx->info.input_interpolate_loc[i]);
1182 		if (k >= 0)
1183 			ctx->eg_interpolators[k].enabled = TRUE;
1184 	}
1185 
1186 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1187 		return 0;
1188 	}
1189 
1190 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1191 	while (!tgsi_parse_end_of_tokens(&parse)) {
1192 		tgsi_parse_token(&parse);
1193 
1194 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1195 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1196 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1197 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1198 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1199 			{
1200 				int interpolate, location, k;
1201 
1202 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1203 					location = TGSI_INTERPOLATE_LOC_CENTER;
1204 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1205 					location = TGSI_INTERPOLATE_LOC_CENTER;
1206 				} else {
1207 					location = TGSI_INTERPOLATE_LOC_CENTROID;
1208 				}
1209 
1210 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1211 				k = eg_get_interpolator_index(interpolate, location);
1212 				ctx->eg_interpolators[k].enabled = true;
1213 			}
1214 		}
1215 	}
1216 
1217 	tgsi_parse_free(&parse);
1218 
1219 	/* assign gpr to each interpolator according to priority */
1220 	num_baryc = 0;
1221 	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1222 		if (ctx->eg_interpolators[i].enabled) {
1223 			ctx->eg_interpolators[i].ij_index = num_baryc;
1224 			num_baryc ++;
1225 		}
1226 	}
1227 
1228 	/* XXX PULL MODEL and LINE STIPPLE */
1229 
1230 	num_baryc = (num_baryc + 1) >> 1;
1231 	return allocate_system_value_inputs(ctx, num_baryc);
1232 }
1233 
1234 /* sample_id_sel == NULL means fetch for current sample */
load_sample_position(struct r600_shader_ctx * ctx,struct r600_shader_src * sample_id,int chan_sel)1235 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1236 {
1237 	struct r600_bytecode_vtx vtx;
1238 	int r, t1;
1239 
1240 	assert(ctx->fixed_pt_position_gpr != -1);
1241 
1242 	t1 = r600_get_temp(ctx);
1243 
1244 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1245 	vtx.op = FETCH_OP_VFETCH;
1246 	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1247 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1248 	if (sample_id == NULL) {
1249 		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1250 		vtx.src_sel_x = 3;
1251 	}
1252 	else {
1253 		struct r600_bytecode_alu alu;
1254 
1255 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1256 		alu.op = ALU_OP1_MOV;
1257 		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1258 		alu.dst.sel = t1;
1259 		alu.dst.write = 1;
1260 		alu.last = 1;
1261 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1262 		if (r)
1263 			return r;
1264 
1265 		vtx.src_gpr = t1;
1266 		vtx.src_sel_x = 0;
1267 	}
1268 	vtx.mega_fetch_count = 16;
1269 	vtx.dst_gpr = t1;
1270 	vtx.dst_sel_x = 0;
1271 	vtx.dst_sel_y = 1;
1272 	vtx.dst_sel_z = 2;
1273 	vtx.dst_sel_w = 3;
1274 	vtx.data_format = FMT_32_32_32_32_FLOAT;
1275 	vtx.num_format_all = 2;
1276 	vtx.format_comp_all = 1;
1277 	vtx.use_const_fields = 0;
1278 	vtx.offset = 1; // first element is size of buffer
1279 	vtx.endian = r600_endian_swap(32);
1280 	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1281 
1282 	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1283 	if (r)
1284 		return r;
1285 
1286 	return t1;
1287 }
1288 
tgsi_src(struct r600_shader_ctx * ctx,const struct tgsi_full_src_register * tgsi_src,struct r600_shader_src * r600_src)1289 static void tgsi_src(struct r600_shader_ctx *ctx,
1290 		     const struct tgsi_full_src_register *tgsi_src,
1291 		     struct r600_shader_src *r600_src)
1292 {
1293 	memset(r600_src, 0, sizeof(*r600_src));
1294 	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1295 	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1296 	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1297 	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1298 	r600_src->neg = tgsi_src->Register.Negate;
1299 	r600_src->abs = tgsi_src->Register.Absolute;
1300 
1301 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1302 		int index;
1303 		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1304 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1305 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1306 
1307 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1308 			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1309 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1310 				return;
1311 		}
1312 		index = tgsi_src->Register.Index;
1313 		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1314 		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1315 	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1316 		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1317 			r600_src->swizzle[0] = 2; // Z value
1318 			r600_src->swizzle[1] = 2;
1319 			r600_src->swizzle[2] = 2;
1320 			r600_src->swizzle[3] = 2;
1321 			r600_src->sel = ctx->face_gpr;
1322 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1323 			r600_src->swizzle[0] = 3; // W value
1324 			r600_src->swizzle[1] = 3;
1325 			r600_src->swizzle[2] = 3;
1326 			r600_src->swizzle[3] = 3;
1327 			r600_src->sel = ctx->fixed_pt_position_gpr;
1328 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1329 			r600_src->swizzle[0] = 0;
1330 			r600_src->swizzle[1] = 1;
1331 			r600_src->swizzle[2] = 4;
1332 			r600_src->swizzle[3] = 4;
1333 			r600_src->sel = load_sample_position(ctx, NULL, -1);
1334 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1335 			r600_src->swizzle[0] = 3;
1336 			r600_src->swizzle[1] = 3;
1337 			r600_src->swizzle[2] = 3;
1338 			r600_src->swizzle[3] = 3;
1339 			r600_src->sel = 0;
1340 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1341 			r600_src->swizzle[0] = 0;
1342 			r600_src->swizzle[1] = 0;
1343 			r600_src->swizzle[2] = 0;
1344 			r600_src->swizzle[3] = 0;
1345 			r600_src->sel = 0;
1346 		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1347 			r600_src->swizzle[0] = 3;
1348 			r600_src->swizzle[1] = 3;
1349 			r600_src->swizzle[2] = 3;
1350 			r600_src->swizzle[3] = 3;
1351 			r600_src->sel = 1;
1352 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1353 			r600_src->swizzle[0] = 2;
1354 			r600_src->swizzle[1] = 2;
1355 			r600_src->swizzle[2] = 2;
1356 			r600_src->swizzle[3] = 2;
1357 			r600_src->sel = 0;
1358 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1359 			r600_src->sel = 1;
1360 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1361 			r600_src->sel = 3;
1362 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1363 			r600_src->sel = 2;
1364 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1365 			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
1366 				r600_src->sel = ctx->tess_input_info;
1367 				r600_src->swizzle[0] = 2;
1368 				r600_src->swizzle[1] = 2;
1369 				r600_src->swizzle[2] = 2;
1370 				r600_src->swizzle[3] = 2;
1371 			} else {
1372 				r600_src->sel = ctx->tess_input_info;
1373 				r600_src->swizzle[0] = 3;
1374 				r600_src->swizzle[1] = 3;
1375 				r600_src->swizzle[2] = 3;
1376 				r600_src->swizzle[3] = 3;
1377 			}
1378 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1379 			r600_src->sel = 0;
1380 			r600_src->swizzle[0] = 0;
1381 			r600_src->swizzle[1] = 0;
1382 			r600_src->swizzle[2] = 0;
1383 			r600_src->swizzle[3] = 0;
1384 		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1385 			r600_src->sel = 0;
1386 			r600_src->swizzle[0] = 3;
1387 			r600_src->swizzle[1] = 3;
1388 			r600_src->swizzle[2] = 3;
1389 			r600_src->swizzle[3] = 3;
1390 		}
1391 	} else {
1392 		if (tgsi_src->Register.Indirect)
1393 			r600_src->rel = V_SQ_REL_RELATIVE;
1394 		r600_src->sel = tgsi_src->Register.Index;
1395 		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1396 	}
1397 	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1398 		if (tgsi_src->Register.Dimension) {
1399 			r600_src->kc_bank = tgsi_src->Dimension.Index;
1400 			if (tgsi_src->Dimension.Indirect) {
1401 				r600_src->kc_rel = 1;
1402 			}
1403 		}
1404 	}
1405 }
1406 
tgsi_fetch_rel_const(struct r600_shader_ctx * ctx,unsigned int cb_idx,unsigned cb_rel,unsigned int offset,unsigned ar_chan,unsigned int dst_reg)1407 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1408                                 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1409                                 unsigned int dst_reg)
1410 {
1411 	struct r600_bytecode_vtx vtx;
1412 	unsigned int ar_reg;
1413 	int r;
1414 
1415 	if (offset) {
1416 		struct r600_bytecode_alu alu;
1417 
1418 		memset(&alu, 0, sizeof(alu));
1419 
1420 		alu.op = ALU_OP2_ADD_INT;
1421 		alu.src[0].sel = ctx->bc->ar_reg;
1422 		alu.src[0].chan = ar_chan;
1423 
1424 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1425 		alu.src[1].value = offset;
1426 
1427 		alu.dst.sel = dst_reg;
1428 		alu.dst.chan = ar_chan;
1429 		alu.dst.write = 1;
1430 		alu.last = 1;
1431 
1432 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1433 			return r;
1434 
1435 		ar_reg = dst_reg;
1436 	} else {
1437 		ar_reg = ctx->bc->ar_reg;
1438 	}
1439 
1440 	memset(&vtx, 0, sizeof(vtx));
1441 	vtx.buffer_id = cb_idx;
1442 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1443 	vtx.src_gpr = ar_reg;
1444 	vtx.src_sel_x = ar_chan;
1445 	vtx.mega_fetch_count = 16;
1446 	vtx.dst_gpr = dst_reg;
1447 	vtx.dst_sel_x = 0;		/* SEL_X */
1448 	vtx.dst_sel_y = 1;		/* SEL_Y */
1449 	vtx.dst_sel_z = 2;		/* SEL_Z */
1450 	vtx.dst_sel_w = 3;		/* SEL_W */
1451 	vtx.data_format = FMT_32_32_32_32_FLOAT;
1452 	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1453 	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1454 	vtx.endian = r600_endian_swap(32);
1455 	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1456 
1457 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1458 		return r;
1459 
1460 	return 0;
1461 }
1462 
fetch_gs_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1463 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1464 {
1465 	struct r600_bytecode_vtx vtx;
1466 	int r;
1467 	unsigned index = src->Register.Index;
1468 	unsigned vtx_id = src->Dimension.Index;
1469 	int offset_reg = vtx_id / 3;
1470 	int offset_chan = vtx_id % 3;
1471 	int t2 = 0;
1472 
1473 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1474 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1475 
1476 	if (offset_reg == 0 && offset_chan == 2)
1477 		offset_chan = 3;
1478 
1479 	if (src->Dimension.Indirect || src->Register.Indirect)
1480 		t2 = r600_get_temp(ctx);
1481 
1482 	if (src->Dimension.Indirect) {
1483 		int treg[3];
1484 		struct r600_bytecode_alu alu;
1485 		int r, i;
1486 		unsigned addr_reg;
1487 		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1488 		if (src->DimIndirect.Index > 0) {
1489 			r = single_alu_op2(ctx, ALU_OP1_MOV,
1490 					   ctx->bc->ar_reg, 0,
1491 					   addr_reg, 0,
1492 					   0, 0);
1493 			if (r)
1494 				return r;
1495 		}
1496 		/*
1497 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1498 		   at least this is what fglrx seems to do. */
1499 		for (i = 0; i < 3; i++) {
1500 			treg[i] = r600_get_temp(ctx);
1501 		}
1502 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1503 
1504 		for (i = 0; i < 3; i++) {
1505 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1506 			alu.op = ALU_OP1_MOV;
1507 			alu.src[0].sel = 0;
1508 			alu.src[0].chan = i == 2 ? 3 : i;
1509 			alu.dst.sel = treg[i];
1510 			alu.dst.chan = 0;
1511 			alu.dst.write = 1;
1512 			alu.last = 1;
1513 			r = r600_bytecode_add_alu(ctx->bc, &alu);
1514 			if (r)
1515 				return r;
1516 		}
1517 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1518 		alu.op = ALU_OP1_MOV;
1519 		alu.src[0].sel = treg[0];
1520 		alu.src[0].rel = 1;
1521 		alu.dst.sel = t2;
1522 		alu.dst.write = 1;
1523 		alu.last = 1;
1524 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1525 		if (r)
1526 			return r;
1527 		offset_reg = t2;
1528 		offset_chan = 0;
1529 	}
1530 
1531 	if (src->Register.Indirect) {
1532 		int addr_reg;
1533 		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1534 
1535 		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1536 
1537 		/* pull the value from index_reg */
1538 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1539 				   t2, 1,
1540 				   addr_reg, 0,
1541 				   V_SQ_ALU_SRC_LITERAL, first);
1542 		if (r)
1543 			return r;
1544 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1545 				   t2, 0,
1546 				   t2, 1,
1547 				   V_SQ_ALU_SRC_LITERAL, 4,
1548 				   offset_reg, offset_chan);
1549 		if (r)
1550 			return r;
1551 		offset_reg = t2;
1552 		offset_chan = 0;
1553 		index = src->Register.Index - first;
1554 	}
1555 
1556 	memset(&vtx, 0, sizeof(vtx));
1557 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1558 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1559 	vtx.src_gpr = offset_reg;
1560 	vtx.src_sel_x = offset_chan;
1561 	vtx.offset = index * 16; /*bytes*/
1562 	vtx.mega_fetch_count = 16;
1563 	vtx.dst_gpr = dst_reg;
1564 	vtx.dst_sel_x = 0;		/* SEL_X */
1565 	vtx.dst_sel_y = 1;		/* SEL_Y */
1566 	vtx.dst_sel_z = 2;		/* SEL_Z */
1567 	vtx.dst_sel_w = 3;		/* SEL_W */
1568 	if (ctx->bc->chip_class >= EVERGREEN) {
1569 		vtx.use_const_fields = 1;
1570 	} else {
1571 		vtx.data_format = FMT_32_32_32_32_FLOAT;
1572 	}
1573 
1574 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1575 		return r;
1576 
1577 	return 0;
1578 }
1579 
tgsi_split_gs_inputs(struct r600_shader_ctx * ctx)1580 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1581 {
1582 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1583 	unsigned i;
1584 
1585 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1586 		struct tgsi_full_src_register *src = &inst->Src[i];
1587 
1588 		if (src->Register.File == TGSI_FILE_INPUT) {
1589 			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1590 				/* primitive id is in R0.z */
1591 				ctx->src[i].sel = 0;
1592 				ctx->src[i].swizzle[0] = 2;
1593 			}
1594 		}
1595 		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1596 			int treg = r600_get_temp(ctx);
1597 
1598 			fetch_gs_input(ctx, src, treg);
1599 			ctx->src[i].sel = treg;
1600 			ctx->src[i].rel = 0;
1601 		}
1602 	}
1603 	return 0;
1604 }
1605 
1606 
1607 /* Tessellation shaders pass outputs to the next shader using LDS.
1608  *
1609  * LS outputs = TCS(HS) inputs
1610  * TCS(HS) outputs = TES(DS) inputs
1611  *
1612  * The LDS layout is:
1613  * - TCS inputs for patch 0
1614  * - TCS inputs for patch 1
1615  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1616  * - ...
1617  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1618  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1619  * - TCS outputs for patch 1
1620  * - Per-patch TCS outputs for patch 1
1621  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1622  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1623  * - ...
1624  *
1625  * All three shaders VS(LS), TCS, TES share the same LDS space.
1626  */
1627 /* this will return with the dw address in temp_reg.x */
r600_get_byte_address(struct r600_shader_ctx * ctx,int temp_reg,const struct tgsi_full_dst_register * dst,const struct tgsi_full_src_register * src,int stride_bytes_reg,int stride_bytes_chan)1628 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
1629 				 const struct tgsi_full_dst_register *dst,
1630 				 const struct tgsi_full_src_register *src,
1631 				 int stride_bytes_reg, int stride_bytes_chan)
1632 {
1633 	struct tgsi_full_dst_register reg;
1634 	ubyte *name, *index, *array_first;
1635 	int r;
1636 	int param;
1637 	struct tgsi_shader_info *info = &ctx->info;
1638 	/* Set the register description. The address computation is the same
1639 	 * for sources and destinations. */
1640 	if (src) {
1641 		reg.Register.File = src->Register.File;
1642 		reg.Register.Index = src->Register.Index;
1643 		reg.Register.Indirect = src->Register.Indirect;
1644 		reg.Register.Dimension = src->Register.Dimension;
1645 		reg.Indirect = src->Indirect;
1646 		reg.Dimension = src->Dimension;
1647 		reg.DimIndirect = src->DimIndirect;
1648 	} else
1649 		reg = *dst;
1650 
1651 	/* If the register is 2-dimensional (e.g. an array of vertices
1652 	 * in a primitive), calculate the base address of the vertex. */
1653 	if (reg.Register.Dimension) {
1654 		int sel, chan;
1655 		if (reg.Dimension.Indirect) {
1656 			unsigned addr_reg;
1657 			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
1658 
1659 			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
1660 			/* pull the value from index_reg */
1661 			sel = addr_reg;
1662 			chan = 0;
1663 		} else {
1664 			sel = V_SQ_ALU_SRC_LITERAL;
1665 			chan = reg.Dimension.Index;
1666 		}
1667 
1668 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1669 				   temp_reg, 0,
1670 				   stride_bytes_reg, stride_bytes_chan,
1671 				   sel, chan,
1672 				   temp_reg, 0);
1673 		if (r)
1674 			return r;
1675 	}
1676 
1677 	if (reg.Register.File == TGSI_FILE_INPUT) {
1678 		name = info->input_semantic_name;
1679 		index = info->input_semantic_index;
1680 		array_first = info->input_array_first;
1681 	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1682 		name = info->output_semantic_name;
1683 		index = info->output_semantic_index;
1684 		array_first = info->output_array_first;
1685 	} else {
1686 		assert(0);
1687 		return -1;
1688 	}
1689 	if (reg.Register.Indirect) {
1690 		int addr_reg;
1691 		int first;
1692 		/* Add the relative address of the element. */
1693 		if (reg.Indirect.ArrayID)
1694 			first = array_first[reg.Indirect.ArrayID];
1695 		else
1696 			first = reg.Register.Index;
1697 
1698 		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
1699 
1700 		/* pull the value from index_reg */
1701 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1702 				   temp_reg, 0,
1703 				   V_SQ_ALU_SRC_LITERAL, 16,
1704 				   addr_reg, 0,
1705 				   temp_reg, 0);
1706 		if (r)
1707 			return r;
1708 
1709 		param = r600_get_lds_unique_index(name[first],
1710 						  index[first]);
1711 
1712 	} else {
1713 		param = r600_get_lds_unique_index(name[reg.Register.Index],
1714 						  index[reg.Register.Index]);
1715 	}
1716 
1717 	/* add to base_addr - passed in temp_reg.x */
1718 	if (param) {
1719 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1720 				   temp_reg, 0,
1721 				   temp_reg, 0,
1722 				   V_SQ_ALU_SRC_LITERAL, param * 16);
1723 		if (r)
1724 			return r;
1725 
1726 	}
1727 	return 0;
1728 }
1729 
do_lds_fetch_values(struct r600_shader_ctx * ctx,unsigned temp_reg,unsigned dst_reg)1730 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1731 			       unsigned dst_reg)
1732 {
1733 	struct r600_bytecode_alu alu;
1734 	int r, i;
1735 
1736 	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1737 		ctx->bc->force_add_cf = 1;
1738 	for (i = 1; i < 4; i++) {
1739 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1740 				   temp_reg, i,
1741 				   temp_reg, 0,
1742 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
1743 		if (r)
1744 			return r;
1745 	}
1746 	for (i = 0; i < 4; i++) {
1747 		/* emit an LDS_READ_RET */
1748 		memset(&alu, 0, sizeof(alu));
1749 		alu.op = LDS_OP1_LDS_READ_RET;
1750 		alu.src[0].sel = temp_reg;
1751 		alu.src[0].chan = i;
1752 		alu.src[1].sel = V_SQ_ALU_SRC_0;
1753 		alu.src[2].sel = V_SQ_ALU_SRC_0;
1754 		alu.dst.chan = 0;
1755 		alu.is_lds_idx_op = true;
1756 		alu.last = 1;
1757 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1758 		if (r)
1759 			return r;
1760 	}
1761 	for (i = 0; i < 4; i++) {
1762 		/* then read from LDS_OQ_A_POP */
1763 		memset(&alu, 0, sizeof(alu));
1764 
1765 		alu.op = ALU_OP1_MOV;
1766 		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1767 		alu.src[0].chan = 0;
1768 		alu.dst.sel = dst_reg;
1769 		alu.dst.chan = i;
1770 		alu.dst.write = 1;
1771 		alu.last = 1;
1772 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1773 		if (r)
1774 			return r;
1775 	}
1776 	return 0;
1777 }
1778 
fetch_tes_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1779 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1780 {
1781 	int r;
1782 	unsigned temp_reg = r600_get_temp(ctx);
1783 
1784 	r = get_lds_offset0(ctx, 2, temp_reg,
1785 			    src->Register.Dimension ? false : true);
1786 	if (r)
1787 		return r;
1788 
1789 	/* the base address is now in temp.x */
1790 	r = r600_get_byte_address(ctx, temp_reg,
1791 				  NULL, src, ctx->tess_output_info, 1);
1792 	if (r)
1793 		return r;
1794 
1795 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1796 	if (r)
1797 		return r;
1798 	return 0;
1799 }
1800 
fetch_tcs_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1801 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1802 {
1803 	int r;
1804 	unsigned temp_reg = r600_get_temp(ctx);
1805 
1806 	/* t.x = ips * r0.y */
1807 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1808 			   temp_reg, 0,
1809 			   ctx->tess_input_info, 0,
1810 			   0, 1);
1811 
1812 	if (r)
1813 		return r;
1814 
1815 	/* the base address is now in temp.x */
1816 	r = r600_get_byte_address(ctx, temp_reg,
1817 				  NULL, src, ctx->tess_input_info, 1);
1818 	if (r)
1819 		return r;
1820 
1821 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1822 	if (r)
1823 		return r;
1824 	return 0;
1825 }
1826 
fetch_tcs_output(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1827 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1828 {
1829 	int r;
1830 	unsigned temp_reg = r600_get_temp(ctx);
1831 
1832 	r = get_lds_offset0(ctx, 1, temp_reg,
1833 			    src->Register.Dimension ? false : true);
1834 	if (r)
1835 		return r;
1836 	/* the base address is now in temp.x */
1837 	r = r600_get_byte_address(ctx, temp_reg,
1838 				  NULL, src,
1839 				  ctx->tess_output_info, 1);
1840 	if (r)
1841 		return r;
1842 
1843 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1844 	if (r)
1845 		return r;
1846 	return 0;
1847 }
1848 
tgsi_split_lds_inputs(struct r600_shader_ctx * ctx)1849 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1850 {
1851 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1852 	unsigned i;
1853 
1854 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1855 		struct tgsi_full_src_register *src = &inst->Src[i];
1856 
1857 		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1858 			int treg = r600_get_temp(ctx);
1859 			fetch_tes_input(ctx, src, treg);
1860 			ctx->src[i].sel = treg;
1861 			ctx->src[i].rel = 0;
1862 		}
1863 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1864 			int treg = r600_get_temp(ctx);
1865 			fetch_tcs_input(ctx, src, treg);
1866 			ctx->src[i].sel = treg;
1867 			ctx->src[i].rel = 0;
1868 		}
1869 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1870 			int treg = r600_get_temp(ctx);
1871 			fetch_tcs_output(ctx, src, treg);
1872 			ctx->src[i].sel = treg;
1873 			ctx->src[i].rel = 0;
1874 		}
1875 	}
1876 	return 0;
1877 }
1878 
tgsi_split_constant(struct r600_shader_ctx * ctx)1879 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1880 {
1881 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1882 	struct r600_bytecode_alu alu;
1883 	int i, j, k, nconst, r;
1884 
1885 	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1886 		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1887 			nconst++;
1888 		}
1889 		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1890 	}
1891 	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1892 		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1893 			continue;
1894 		}
1895 
1896 		if (ctx->src[i].rel) {
1897 			int chan = inst->Src[i].Indirect.Swizzle;
1898 			int treg = r600_get_temp(ctx);
1899 			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
1900 				return r;
1901 
1902 			ctx->src[i].kc_bank = 0;
1903 			ctx->src[i].kc_rel = 0;
1904 			ctx->src[i].sel = treg;
1905 			ctx->src[i].rel = 0;
1906 			j--;
1907 		} else if (j > 0) {
1908 			int treg = r600_get_temp(ctx);
1909 			for (k = 0; k < 4; k++) {
1910 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1911 				alu.op = ALU_OP1_MOV;
1912 				alu.src[0].sel = ctx->src[i].sel;
1913 				alu.src[0].chan = k;
1914 				alu.src[0].rel = ctx->src[i].rel;
1915 				alu.src[0].kc_bank = ctx->src[i].kc_bank;
1916 				alu.src[0].kc_rel = ctx->src[i].kc_rel;
1917 				alu.dst.sel = treg;
1918 				alu.dst.chan = k;
1919 				alu.dst.write = 1;
1920 				if (k == 3)
1921 					alu.last = 1;
1922 				r = r600_bytecode_add_alu(ctx->bc, &alu);
1923 				if (r)
1924 					return r;
1925 			}
1926 			ctx->src[i].sel = treg;
1927 			ctx->src[i].rel =0;
1928 			j--;
1929 		}
1930 	}
1931 	return 0;
1932 }
1933 
1934 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
tgsi_split_literal_constant(struct r600_shader_ctx * ctx)1935 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1936 {
1937 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1938 	struct r600_bytecode_alu alu;
1939 	int i, j, k, nliteral, r;
1940 
1941 	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1942 		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1943 			nliteral++;
1944 		}
1945 	}
1946 	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1947 		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1948 			int treg = r600_get_temp(ctx);
1949 			for (k = 0; k < 4; k++) {
1950 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1951 				alu.op = ALU_OP1_MOV;
1952 				alu.src[0].sel = ctx->src[i].sel;
1953 				alu.src[0].chan = k;
1954 				alu.src[0].value = ctx->src[i].value[k];
1955 				alu.dst.sel = treg;
1956 				alu.dst.chan = k;
1957 				alu.dst.write = 1;
1958 				if (k == 3)
1959 					alu.last = 1;
1960 				r = r600_bytecode_add_alu(ctx->bc, &alu);
1961 				if (r)
1962 					return r;
1963 			}
1964 			ctx->src[i].sel = treg;
1965 			j--;
1966 		}
1967 	}
1968 	return 0;
1969 }
1970 
process_twoside_color_inputs(struct r600_shader_ctx * ctx)1971 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1972 {
1973 	int i, r, count = ctx->shader->ninput;
1974 
1975 	for (i = 0; i < count; i++) {
1976 		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1977 			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1978 			if (r)
1979 				return r;
1980 		}
1981 	}
1982 	return 0;
1983 }
1984 
emit_streamout(struct r600_shader_ctx * ctx,struct pipe_stream_output_info * so,int stream,unsigned * stream_item_size)1985 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1986 						  int stream, unsigned *stream_item_size)
1987 {
1988 	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1989 	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1990 	int i, j, r;
1991 
1992 	/* Sanity checking. */
1993 	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1994 		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1995 		r = -EINVAL;
1996 		goto out_err;
1997 	}
1998 	for (i = 0; i < so->num_outputs; i++) {
1999 		if (so->output[i].output_buffer >= 4) {
2000 			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2001 				 so->output[i].output_buffer);
2002 			r = -EINVAL;
2003 			goto out_err;
2004 		}
2005 	}
2006 
2007 	/* Initialize locations where the outputs are stored. */
2008 	for (i = 0; i < so->num_outputs; i++) {
2009 
2010 		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2011 		start_comp[i] = so->output[i].start_component;
2012 		/* Lower outputs with dst_offset < start_component.
2013 		 *
2014 		 * We can only output 4D vectors with a write mask, e.g. we can
2015 		 * only output the W component at offset 3, etc. If we want
2016 		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2017 		 * to move it to X and output X. */
2018 		if (so->output[i].dst_offset < so->output[i].start_component) {
2019 			unsigned tmp = r600_get_temp(ctx);
2020 
2021 			for (j = 0; j < so->output[i].num_components; j++) {
2022 				struct r600_bytecode_alu alu;
2023 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2024 				alu.op = ALU_OP1_MOV;
2025 				alu.src[0].sel = so_gpr[i];
2026 				alu.src[0].chan = so->output[i].start_component + j;
2027 
2028 				alu.dst.sel = tmp;
2029 				alu.dst.chan = j;
2030 				alu.dst.write = 1;
2031 				if (j == so->output[i].num_components - 1)
2032 					alu.last = 1;
2033 				r = r600_bytecode_add_alu(ctx->bc, &alu);
2034 				if (r)
2035 					return r;
2036 			}
2037 			start_comp[i] = 0;
2038 			so_gpr[i] = tmp;
2039 		}
2040 	}
2041 
2042 	/* Write outputs to buffers. */
2043 	for (i = 0; i < so->num_outputs; i++) {
2044 		struct r600_bytecode_output output;
2045 
2046 		if (stream != -1 && stream != so->output[i].output_buffer)
2047 			continue;
2048 
2049 		memset(&output, 0, sizeof(struct r600_bytecode_output));
2050 		output.gpr = so_gpr[i];
2051 		output.elem_size = so->output[i].num_components - 1;
2052 		if (output.elem_size == 2)
2053 			output.elem_size = 3; // 3 not supported, write 4 with junk at end
2054 		output.array_base = so->output[i].dst_offset - start_comp[i];
2055 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2056 		output.burst_count = 1;
2057 		/* array_size is an upper limit for the burst_count
2058 		 * with MEM_STREAM instructions */
2059 		output.array_size = 0xFFF;
2060 		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2061 
2062 		if (ctx->bc->chip_class >= EVERGREEN) {
2063 			switch (so->output[i].output_buffer) {
2064 			case 0:
2065 				output.op = CF_OP_MEM_STREAM0_BUF0;
2066 				break;
2067 			case 1:
2068 				output.op = CF_OP_MEM_STREAM0_BUF1;
2069 				break;
2070 			case 2:
2071 				output.op = CF_OP_MEM_STREAM0_BUF2;
2072 				break;
2073 			case 3:
2074 				output.op = CF_OP_MEM_STREAM0_BUF3;
2075 				break;
2076 			}
2077 			output.op += so->output[i].stream * 4;
2078 			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2079 			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2080 		} else {
2081 			switch (so->output[i].output_buffer) {
2082 			case 0:
2083 				output.op = CF_OP_MEM_STREAM0;
2084 				break;
2085 			case 1:
2086 				output.op = CF_OP_MEM_STREAM1;
2087 				break;
2088 			case 2:
2089 				output.op = CF_OP_MEM_STREAM2;
2090 				break;
2091 			case 3:
2092 				output.op = CF_OP_MEM_STREAM3;
2093 					break;
2094 			}
2095 			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2096 		}
2097 		r = r600_bytecode_add_output(ctx->bc, &output);
2098 		if (r)
2099 			goto out_err;
2100 	}
2101 	return 0;
2102 out_err:
2103 	return r;
2104 }
2105 
convert_edgeflag_to_int(struct r600_shader_ctx * ctx)2106 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2107 {
2108 	struct r600_bytecode_alu alu;
2109 	unsigned reg;
2110 
2111 	if (!ctx->shader->vs_out_edgeflag)
2112 		return;
2113 
2114 	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2115 
2116 	/* clamp(x, 0, 1) */
2117 	memset(&alu, 0, sizeof(alu));
2118 	alu.op = ALU_OP1_MOV;
2119 	alu.src[0].sel = reg;
2120 	alu.dst.sel = reg;
2121 	alu.dst.write = 1;
2122 	alu.dst.clamp = 1;
2123 	alu.last = 1;
2124 	r600_bytecode_add_alu(ctx->bc, &alu);
2125 
2126 	memset(&alu, 0, sizeof(alu));
2127 	alu.op = ALU_OP1_FLT_TO_INT;
2128 	alu.src[0].sel = reg;
2129 	alu.dst.sel = reg;
2130 	alu.dst.write = 1;
2131 	alu.last = 1;
2132 	r600_bytecode_add_alu(ctx->bc, &alu);
2133 }
2134 
generate_gs_copy_shader(struct r600_context * rctx,struct r600_pipe_shader * gs,struct pipe_stream_output_info * so)2135 static int generate_gs_copy_shader(struct r600_context *rctx,
2136 				   struct r600_pipe_shader *gs,
2137 				   struct pipe_stream_output_info *so)
2138 {
2139 	struct r600_shader_ctx ctx = {};
2140 	struct r600_shader *gs_shader = &gs->shader;
2141 	struct r600_pipe_shader *cshader;
2142 	int ocnt = gs_shader->noutput;
2143 	struct r600_bytecode_alu alu;
2144 	struct r600_bytecode_vtx vtx;
2145 	struct r600_bytecode_output output;
2146 	struct r600_bytecode_cf *cf_jump, *cf_pop,
2147 		*last_exp_pos = NULL, *last_exp_param = NULL;
2148 	int i, j, next_clip_pos = 61, next_param = 0;
2149 	int ring;
2150 	bool only_ring_0 = true;
2151 	cshader = calloc(1, sizeof(struct r600_pipe_shader));
2152 	if (!cshader)
2153 		return 0;
2154 
2155 	memcpy(cshader->shader.output, gs_shader->output, ocnt *
2156 	       sizeof(struct r600_shader_io));
2157 
2158 	cshader->shader.noutput = ocnt;
2159 
2160 	ctx.shader = &cshader->shader;
2161 	ctx.bc = &ctx.shader->bc;
2162 	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2163 
2164 	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2165 			   rctx->screen->has_compressed_msaa_texturing);
2166 
2167 	ctx.bc->isa = rctx->isa;
2168 
2169 	cf_jump = NULL;
2170 	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2171 
2172 	/* R0.x = R0.x & 0x3fffffff */
2173 	memset(&alu, 0, sizeof(alu));
2174 	alu.op = ALU_OP2_AND_INT;
2175 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2176 	alu.src[1].value = 0x3fffffff;
2177 	alu.dst.write = 1;
2178 	r600_bytecode_add_alu(ctx.bc, &alu);
2179 
2180 	/* R0.y = R0.x >> 30 */
2181 	memset(&alu, 0, sizeof(alu));
2182 	alu.op = ALU_OP2_LSHR_INT;
2183 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2184 	alu.src[1].value = 0x1e;
2185 	alu.dst.chan = 1;
2186 	alu.dst.write = 1;
2187 	alu.last = 1;
2188 	r600_bytecode_add_alu(ctx.bc, &alu);
2189 
2190 	/* fetch vertex data from GSVS ring */
2191 	for (i = 0; i < ocnt; ++i) {
2192 		struct r600_shader_io *out = &ctx.shader->output[i];
2193 
2194 		out->gpr = i + 1;
2195 		out->ring_offset = i * 16;
2196 
2197 		memset(&vtx, 0, sizeof(vtx));
2198 		vtx.op = FETCH_OP_VFETCH;
2199 		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2200 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2201 		vtx.mega_fetch_count = 16;
2202 		vtx.offset = out->ring_offset;
2203 		vtx.dst_gpr = out->gpr;
2204 		vtx.src_gpr = 0;
2205 		vtx.dst_sel_x = 0;
2206 		vtx.dst_sel_y = 1;
2207 		vtx.dst_sel_z = 2;
2208 		vtx.dst_sel_w = 3;
2209 		if (rctx->b.chip_class >= EVERGREEN) {
2210 			vtx.use_const_fields = 1;
2211 		} else {
2212 			vtx.data_format = FMT_32_32_32_32_FLOAT;
2213 		}
2214 
2215 		r600_bytecode_add_vtx(ctx.bc, &vtx);
2216 	}
2217 	ctx.temp_reg = i + 1;
2218 	for (ring = 3; ring >= 0; --ring) {
2219 		bool enabled = false;
2220 		for (i = 0; i < so->num_outputs; i++) {
2221 			if (so->output[i].stream == ring) {
2222 				enabled = true;
2223 				if (ring > 0)
2224 					only_ring_0 = false;
2225 				break;
2226 			}
2227 		}
2228 		if (ring != 0 && !enabled) {
2229 			cshader->shader.ring_item_sizes[ring] = 0;
2230 			continue;
2231 		}
2232 
2233 		if (cf_jump) {
2234 			// Patch up jump label
2235 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2236 			cf_pop = ctx.bc->cf_last;
2237 
2238 			cf_jump->cf_addr = cf_pop->id + 2;
2239 			cf_jump->pop_count = 1;
2240 			cf_pop->cf_addr = cf_pop->id + 2;
2241 			cf_pop->pop_count = 1;
2242 		}
2243 
2244 		/* PRED_SETE_INT __, R0.y, ring */
2245 		memset(&alu, 0, sizeof(alu));
2246 		alu.op = ALU_OP2_PRED_SETE_INT;
2247 		alu.src[0].chan = 1;
2248 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2249 		alu.src[1].value = ring;
2250 		alu.execute_mask = 1;
2251 		alu.update_pred = 1;
2252 		alu.last = 1;
2253 		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2254 
2255 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2256 		cf_jump = ctx.bc->cf_last;
2257 
2258 		if (enabled)
2259 			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2260 		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2261 	}
2262 
2263 	/* bc adds nops - copy it */
2264 	if (ctx.bc->chip_class == R600) {
2265 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2266 		alu.op = ALU_OP0_NOP;
2267 		alu.last = 1;
2268 		r600_bytecode_add_alu(ctx.bc, &alu);
2269 
2270 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2271 	}
2272 
2273 	/* export vertex data */
2274 	/* XXX factor out common code with r600_shader_from_tgsi ? */
2275 	for (i = 0; i < ocnt; ++i) {
2276 		struct r600_shader_io *out = &ctx.shader->output[i];
2277 		bool instream0 = true;
2278 		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2279 			continue;
2280 
2281 		for (j = 0; j < so->num_outputs; j++) {
2282 			if (so->output[j].register_index == i) {
2283 				if (so->output[j].stream == 0)
2284 					break;
2285 				if (so->output[j].stream > 0)
2286 					instream0 = false;
2287 			}
2288 		}
2289 		if (!instream0)
2290 			continue;
2291 		memset(&output, 0, sizeof(output));
2292 		output.gpr = out->gpr;
2293 		output.elem_size = 3;
2294 		output.swizzle_x = 0;
2295 		output.swizzle_y = 1;
2296 		output.swizzle_z = 2;
2297 		output.swizzle_w = 3;
2298 		output.burst_count = 1;
2299 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2300 		output.op = CF_OP_EXPORT;
2301 		switch (out->name) {
2302 		case TGSI_SEMANTIC_POSITION:
2303 			output.array_base = 60;
2304 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2305 			break;
2306 
2307 		case TGSI_SEMANTIC_PSIZE:
2308 			output.array_base = 61;
2309 			if (next_clip_pos == 61)
2310 				next_clip_pos = 62;
2311 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2312 			output.swizzle_y = 7;
2313 			output.swizzle_z = 7;
2314 			output.swizzle_w = 7;
2315 			ctx.shader->vs_out_misc_write = 1;
2316 			ctx.shader->vs_out_point_size = 1;
2317 			break;
2318 		case TGSI_SEMANTIC_LAYER:
2319 			if (out->spi_sid) {
2320 				/* duplicate it as PARAM to pass to the pixel shader */
2321 				output.array_base = next_param++;
2322 				r600_bytecode_add_output(ctx.bc, &output);
2323 				last_exp_param = ctx.bc->cf_last;
2324 			}
2325 			output.array_base = 61;
2326 			if (next_clip_pos == 61)
2327 				next_clip_pos = 62;
2328 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2329 			output.swizzle_x = 7;
2330 			output.swizzle_y = 7;
2331 			output.swizzle_z = 0;
2332 			output.swizzle_w = 7;
2333 			ctx.shader->vs_out_misc_write = 1;
2334 			ctx.shader->vs_out_layer = 1;
2335 			break;
2336 		case TGSI_SEMANTIC_VIEWPORT_INDEX:
2337 			if (out->spi_sid) {
2338 				/* duplicate it as PARAM to pass to the pixel shader */
2339 				output.array_base = next_param++;
2340 				r600_bytecode_add_output(ctx.bc, &output);
2341 				last_exp_param = ctx.bc->cf_last;
2342 			}
2343 			output.array_base = 61;
2344 			if (next_clip_pos == 61)
2345 				next_clip_pos = 62;
2346 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2347 			ctx.shader->vs_out_misc_write = 1;
2348 			ctx.shader->vs_out_viewport = 1;
2349 			output.swizzle_x = 7;
2350 			output.swizzle_y = 7;
2351 			output.swizzle_z = 7;
2352 			output.swizzle_w = 0;
2353 			break;
2354 		case TGSI_SEMANTIC_CLIPDIST:
2355 			/* spi_sid is 0 for clipdistance outputs that were generated
2356 			 * for clipvertex - we don't need to pass them to PS */
2357 			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2358 			if (out->spi_sid) {
2359 				/* duplicate it as PARAM to pass to the pixel shader */
2360 				output.array_base = next_param++;
2361 				r600_bytecode_add_output(ctx.bc, &output);
2362 				last_exp_param = ctx.bc->cf_last;
2363 			}
2364 			output.array_base = next_clip_pos++;
2365 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2366 			break;
2367 		case TGSI_SEMANTIC_FOG:
2368 			output.swizzle_y = 4; /* 0 */
2369 			output.swizzle_z = 4; /* 0 */
2370 			output.swizzle_w = 5; /* 1 */
2371 			break;
2372 		default:
2373 			output.array_base = next_param++;
2374 			break;
2375 		}
2376 		r600_bytecode_add_output(ctx.bc, &output);
2377 		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2378 			last_exp_param = ctx.bc->cf_last;
2379 		else
2380 			last_exp_pos = ctx.bc->cf_last;
2381 	}
2382 
2383 	if (!last_exp_pos) {
2384 		memset(&output, 0, sizeof(output));
2385 		output.gpr = 0;
2386 		output.elem_size = 3;
2387 		output.swizzle_x = 7;
2388 		output.swizzle_y = 7;
2389 		output.swizzle_z = 7;
2390 		output.swizzle_w = 7;
2391 		output.burst_count = 1;
2392 		output.type = 2;
2393 		output.op = CF_OP_EXPORT;
2394 		output.array_base = 60;
2395 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2396 		r600_bytecode_add_output(ctx.bc, &output);
2397 		last_exp_pos = ctx.bc->cf_last;
2398 	}
2399 
2400 	if (!last_exp_param) {
2401 		memset(&output, 0, sizeof(output));
2402 		output.gpr = 0;
2403 		output.elem_size = 3;
2404 		output.swizzle_x = 7;
2405 		output.swizzle_y = 7;
2406 		output.swizzle_z = 7;
2407 		output.swizzle_w = 7;
2408 		output.burst_count = 1;
2409 		output.type = 2;
2410 		output.op = CF_OP_EXPORT;
2411 		output.array_base = next_param++;
2412 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2413 		r600_bytecode_add_output(ctx.bc, &output);
2414 		last_exp_param = ctx.bc->cf_last;
2415 	}
2416 
2417 	last_exp_pos->op = CF_OP_EXPORT_DONE;
2418 	last_exp_param->op = CF_OP_EXPORT_DONE;
2419 
2420 	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2421 	cf_pop = ctx.bc->cf_last;
2422 
2423 	cf_jump->cf_addr = cf_pop->id + 2;
2424 	cf_jump->pop_count = 1;
2425 	cf_pop->cf_addr = cf_pop->id + 2;
2426 	cf_pop->pop_count = 1;
2427 
2428 	if (ctx.bc->chip_class == CAYMAN)
2429 		cm_bytecode_add_cf_end(ctx.bc);
2430 	else {
2431 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2432 		ctx.bc->cf_last->end_of_program = 1;
2433 	}
2434 
2435 	gs->gs_copy_shader = cshader;
2436 	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2437 
2438 	ctx.bc->nstack = 1;
2439 
2440 	return r600_bytecode_build(ctx.bc);
2441 }
2442 
emit_inc_ring_offset(struct r600_shader_ctx * ctx,int idx,bool ind)2443 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2444 {
2445 	if (ind) {
2446 		struct r600_bytecode_alu alu;
2447 		int r;
2448 
2449 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2450 		alu.op = ALU_OP2_ADD_INT;
2451 		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2452 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2453 		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2454 		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2455 		alu.dst.write = 1;
2456 		alu.last = 1;
2457 		r = r600_bytecode_add_alu(ctx->bc, &alu);
2458 		if (r)
2459 			return r;
2460 	}
2461 	return 0;
2462 }
2463 
emit_gs_ring_writes(struct r600_shader_ctx * ctx,const struct pipe_stream_output_info * so,int stream,bool ind)2464 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
2465 {
2466 	struct r600_bytecode_output output;
2467 	int i, k, ring_offset;
2468 	int effective_stream = stream == -1 ? 0 : stream;
2469 	int idx = 0;
2470 
2471 	for (i = 0; i < ctx->shader->noutput; i++) {
2472 		if (ctx->gs_for_vs) {
2473 			/* for ES we need to lookup corresponding ring offset expected by GS
2474 			 * (map this output to GS input by name and sid) */
2475 			/* FIXME precompute offsets */
2476 			ring_offset = -1;
2477 			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2478 				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2479 				struct r600_shader_io *out = &ctx->shader->output[i];
2480 				if (in->name == out->name && in->sid == out->sid)
2481 					ring_offset = in->ring_offset;
2482 			}
2483 
2484 			if (ring_offset == -1)
2485 				continue;
2486 		} else {
2487 			ring_offset = idx * 16;
2488 			idx++;
2489 		}
2490 
2491 		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2492 			continue;
2493 		/* next_ring_offset after parsing input decls contains total size of
2494 		 * single vertex data, gs_next_vertex - current vertex index */
2495 		if (!ind)
2496 			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2497 
2498 		memset(&output, 0, sizeof(struct r600_bytecode_output));
2499 		output.gpr = ctx->shader->output[i].gpr;
2500 		output.elem_size = 3;
2501 		output.comp_mask = 0xF;
2502 		output.burst_count = 1;
2503 
2504 		if (ind)
2505 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2506 		else
2507 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2508 
2509 		switch (stream) {
2510 		default:
2511 		case 0:
2512 			output.op = CF_OP_MEM_RING; break;
2513 		case 1:
2514 			output.op = CF_OP_MEM_RING1; break;
2515 		case 2:
2516 			output.op = CF_OP_MEM_RING2; break;
2517 		case 3:
2518 			output.op = CF_OP_MEM_RING3; break;
2519 		}
2520 
2521 		if (ind) {
2522 			output.array_base = ring_offset >> 2; /* in dwords */
2523 			output.array_size = 0xfff;
2524 			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2525 		} else
2526 			output.array_base = ring_offset >> 2; /* in dwords */
2527 		r600_bytecode_add_output(ctx->bc, &output);
2528 	}
2529 
2530 	++ctx->gs_next_vertex;
2531 	return 0;
2532 }
2533 
2534 
r600_fetch_tess_io_info(struct r600_shader_ctx * ctx)2535 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2536 {
2537 	int r;
2538 	struct r600_bytecode_vtx vtx;
2539 	int temp_val = ctx->temp_reg;
2540 	/* need to store the TCS output somewhere */
2541 	r = single_alu_op2(ctx, ALU_OP1_MOV,
2542 			   temp_val, 0,
2543 			   V_SQ_ALU_SRC_LITERAL, 0,
2544 			   0, 0);
2545 	if (r)
2546 		return r;
2547 
2548 	/* used by VS/TCS */
2549 	if (ctx->tess_input_info) {
2550 		/* fetch tcs input values into resv space */
2551 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2552 		vtx.op = FETCH_OP_VFETCH;
2553 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2554 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2555 		vtx.mega_fetch_count = 16;
2556 		vtx.data_format = FMT_32_32_32_32;
2557 		vtx.num_format_all = 2;
2558 		vtx.format_comp_all = 1;
2559 		vtx.use_const_fields = 0;
2560 		vtx.endian = r600_endian_swap(32);
2561 		vtx.srf_mode_all = 1;
2562 		vtx.offset = 0;
2563 		vtx.dst_gpr = ctx->tess_input_info;
2564 		vtx.dst_sel_x = 0;
2565 		vtx.dst_sel_y = 1;
2566 		vtx.dst_sel_z = 2;
2567 		vtx.dst_sel_w = 3;
2568 		vtx.src_gpr = temp_val;
2569 		vtx.src_sel_x = 0;
2570 
2571 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2572 		if (r)
2573 			return r;
2574 	}
2575 
2576 	/* used by TCS/TES */
2577 	if (ctx->tess_output_info) {
2578 		/* fetch tcs output values into resv space */
2579 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2580 		vtx.op = FETCH_OP_VFETCH;
2581 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2582 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2583 		vtx.mega_fetch_count = 16;
2584 		vtx.data_format = FMT_32_32_32_32;
2585 		vtx.num_format_all = 2;
2586 		vtx.format_comp_all = 1;
2587 		vtx.use_const_fields = 0;
2588 		vtx.endian = r600_endian_swap(32);
2589 		vtx.srf_mode_all = 1;
2590 		vtx.offset = 16;
2591 		vtx.dst_gpr = ctx->tess_output_info;
2592 		vtx.dst_sel_x = 0;
2593 		vtx.dst_sel_y = 1;
2594 		vtx.dst_sel_z = 2;
2595 		vtx.dst_sel_w = 3;
2596 		vtx.src_gpr = temp_val;
2597 		vtx.src_sel_x = 0;
2598 
2599 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2600 		if (r)
2601 			return r;
2602 	}
2603 	return 0;
2604 }
2605 
emit_lds_vs_writes(struct r600_shader_ctx * ctx)2606 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2607 {
2608 	int i, j, r;
2609 	int temp_reg;
2610 
2611 	/* fetch tcs input values into input_vals */
2612 	ctx->tess_input_info = r600_get_temp(ctx);
2613 	ctx->tess_output_info = 0;
2614 	r = r600_fetch_tess_io_info(ctx);
2615 	if (r)
2616 		return r;
2617 
2618 	temp_reg = r600_get_temp(ctx);
2619 	/* dst reg contains LDS address stride * idx */
2620 	/* MUL vertexID, vertex_dw_stride */
2621 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2622 			   temp_reg, 0,
2623 			   ctx->tess_input_info, 1,
2624 			   0, 1); /* rel id in r0.y? */
2625 	if (r)
2626 		return r;
2627 
2628 	for (i = 0; i < ctx->shader->noutput; i++) {
2629 		struct r600_bytecode_alu alu;
2630 		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2631 
2632 		if (param) {
2633 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2634 					   temp_reg, 1,
2635 					   temp_reg, 0,
2636 					   V_SQ_ALU_SRC_LITERAL, param * 16);
2637 			if (r)
2638 				return r;
2639 		}
2640 
2641 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2642 				   temp_reg, 2,
2643 				   temp_reg, param ? 1 : 0,
2644 				   V_SQ_ALU_SRC_LITERAL, 8);
2645 		if (r)
2646 			return r;
2647 
2648 
2649 		for (j = 0; j < 2; j++) {
2650 			int chan = (j == 1) ? 2 : (param ? 1 : 0);
2651 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2652 			alu.op = LDS_OP3_LDS_WRITE_REL;
2653 			alu.src[0].sel = temp_reg;
2654 			alu.src[0].chan = chan;
2655 			alu.src[1].sel = ctx->shader->output[i].gpr;
2656 			alu.src[1].chan = j * 2;
2657 			alu.src[2].sel = ctx->shader->output[i].gpr;
2658 			alu.src[2].chan = (j * 2) + 1;
2659 			alu.last = 1;
2660 			alu.dst.chan = 0;
2661 			alu.lds_idx = 1;
2662 			alu.is_lds_idx_op = true;
2663 			r = r600_bytecode_add_alu(ctx->bc, &alu);
2664 			if (r)
2665 				return r;
2666 		}
2667 	}
2668 	return 0;
2669 }
2670 
r600_store_tcs_output(struct r600_shader_ctx * ctx)2671 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
2672 {
2673 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2674 	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
2675 	int i, r, lasti;
2676 	int temp_reg = r600_get_temp(ctx);
2677 	struct r600_bytecode_alu alu;
2678 	unsigned write_mask = dst->Register.WriteMask;
2679 
2680 	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
2681 		return 0;
2682 
2683 	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
2684 	if (r)
2685 		return r;
2686 
2687 	/* the base address is now in temp.x */
2688 	r = r600_get_byte_address(ctx, temp_reg,
2689 				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
2690 	if (r)
2691 		return r;
2692 
2693 	/* LDS write */
2694 	lasti = tgsi_last_instruction(write_mask);
2695 	for (i = 1; i <= lasti; i++) {
2696 
2697 		if (!(write_mask & (1 << i)))
2698 			continue;
2699 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2700 				   temp_reg, i,
2701 				   temp_reg, 0,
2702 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
2703 		if (r)
2704 			return r;
2705 	}
2706 
2707 	for (i = 0; i <= lasti; i++) {
2708 		if (!(write_mask & (1 << i)))
2709 			continue;
2710 
2711 		if ((i == 0 && ((write_mask & 3) == 3)) ||
2712 		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
2713 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2714 			alu.op = LDS_OP3_LDS_WRITE_REL;
2715 			alu.src[0].sel = temp_reg;
2716 			alu.src[0].chan = i;
2717 
2718 			alu.src[1].sel = dst->Register.Index;
2719 			alu.src[1].sel += ctx->file_offset[dst->Register.File];
2720 			alu.src[1].chan = i;
2721 
2722 			alu.src[2].sel = dst->Register.Index;
2723 			alu.src[2].sel += ctx->file_offset[dst->Register.File];
2724 			alu.src[2].chan = i + 1;
2725 			alu.lds_idx = 1;
2726 			alu.dst.chan = 0;
2727 			alu.last = 1;
2728 			alu.is_lds_idx_op = true;
2729 			r = r600_bytecode_add_alu(ctx->bc, &alu);
2730 			if (r)
2731 				return r;
2732 			i += 1;
2733 			continue;
2734 		}
2735 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2736 		alu.op = LDS_OP2_LDS_WRITE;
2737 		alu.src[0].sel = temp_reg;
2738 		alu.src[0].chan = i;
2739 
2740 		alu.src[1].sel = dst->Register.Index;
2741 		alu.src[1].sel += ctx->file_offset[dst->Register.File];
2742 		alu.src[1].chan = i;
2743 
2744 		alu.src[2].sel = V_SQ_ALU_SRC_0;
2745 		alu.dst.chan = 0;
2746 		alu.last = 1;
2747 		alu.is_lds_idx_op = true;
2748 		r = r600_bytecode_add_alu(ctx->bc, &alu);
2749 		if (r)
2750 			return r;
2751 	}
2752 	return 0;
2753 }
2754 
r600_tess_factor_read(struct r600_shader_ctx * ctx,int output_idx)2755 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2756 				 int output_idx)
2757 {
2758 	int param;
2759 	unsigned temp_reg = r600_get_temp(ctx);
2760 	unsigned name = ctx->shader->output[output_idx].name;
2761 	int dreg = ctx->shader->output[output_idx].gpr;
2762 	int r;
2763 
2764 	param = r600_get_lds_unique_index(name, 0);
2765 	r = get_lds_offset0(ctx, 1, temp_reg, true);
2766 	if (r)
2767 		return r;
2768 
2769 	r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2770 			   temp_reg, 0,
2771 			   temp_reg, 0,
2772 			   V_SQ_ALU_SRC_LITERAL, param * 16);
2773 	if (r)
2774 		return r;
2775 
2776 	do_lds_fetch_values(ctx, temp_reg, dreg);
2777 	return 0;
2778 }
2779 
r600_emit_tess_factor(struct r600_shader_ctx * ctx)2780 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
2781 {
2782 	unsigned i;
2783 	int stride, outer_comps, inner_comps;
2784 	int tessinner_idx = -1, tessouter_idx = -1;
2785 	int r;
2786 	int temp_reg = r600_get_temp(ctx);
2787 	int treg[3] = {-1, -1, -1};
2788 	struct r600_bytecode_alu alu;
2789 	struct r600_bytecode_cf *cf_jump, *cf_pop;
2790 
2791 	/* only execute factor emission for invocation 0 */
2792 	/* PRED_SETE_INT __, R0.x, 0 */
2793 	memset(&alu, 0, sizeof(alu));
2794 	alu.op = ALU_OP2_PRED_SETE_INT;
2795 	alu.src[0].chan = 2;
2796 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2797 	alu.execute_mask = 1;
2798 	alu.update_pred = 1;
2799 	alu.last = 1;
2800 	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2801 
2802 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
2803 	cf_jump = ctx->bc->cf_last;
2804 
2805 	treg[0] = r600_get_temp(ctx);
2806 	switch (ctx->shader->tcs_prim_mode) {
2807 	case PIPE_PRIM_LINES:
2808 		stride = 8; /* 2 dwords, 1 vec2 store */
2809 		outer_comps = 2;
2810 		inner_comps = 0;
2811 		break;
2812 	case PIPE_PRIM_TRIANGLES:
2813 		stride = 16; /* 4 dwords, 1 vec4 store */
2814 		outer_comps = 3;
2815 		inner_comps = 1;
2816 		treg[1] = r600_get_temp(ctx);
2817 		break;
2818 	case PIPE_PRIM_QUADS:
2819 		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
2820 		outer_comps = 4;
2821 		inner_comps = 2;
2822 		treg[1] = r600_get_temp(ctx);
2823 		treg[2] = r600_get_temp(ctx);
2824 		break;
2825 	default:
2826 		assert(0);
2827 		return -1;
2828 	}
2829 
2830 	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
2831 	/* TF_WRITE takes index in R.x, value in R.y */
2832 	for (i = 0; i < ctx->shader->noutput; i++) {
2833 		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSINNER)
2834 			tessinner_idx = i;
2835 		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSOUTER)
2836 			tessouter_idx = i;
2837 	}
2838 
2839 	if (tessouter_idx == -1)
2840 		return -1;
2841 
2842 	if (tessinner_idx == -1 && inner_comps)
2843 		return -1;
2844 
2845 	if (tessouter_idx != -1) {
2846 		r = r600_tess_factor_read(ctx, tessouter_idx);
2847 		if (r)
2848 			return r;
2849 	}
2850 
2851 	if (tessinner_idx != -1) {
2852 		r = r600_tess_factor_read(ctx, tessinner_idx);
2853 		if (r)
2854 			return r;
2855 	}
2856 
2857 	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
2858 	/* r.x = relpatchid(r0.y) * tf_stride */
2859 
2860 	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
2861 	/* add incoming r0.w to it: t.x = t.x + r0.w */
2862 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2863 			   temp_reg, 0,
2864 			   0, 1,
2865 			   V_SQ_ALU_SRC_LITERAL, stride,
2866 			   0, 3);
2867 	if (r)
2868 		return r;
2869 
2870 	for (i = 0; i < outer_comps + inner_comps; i++) {
2871 		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
2872 		int out_comp = i >= outer_comps ? i - outer_comps : i;
2873 
2874 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2875 				   treg[i / 2], (2 * (i % 2)),
2876 				   temp_reg, 0,
2877 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
2878 		if (r)
2879 			return r;
2880 		r = single_alu_op2(ctx, ALU_OP1_MOV,
2881 				   treg[i / 2], 1 + (2 * (i%2)),
2882 				   ctx->shader->output[out_idx].gpr, out_comp,
2883 				   0, 0);
2884 		if (r)
2885 			return r;
2886 	}
2887 	for (i = 0; i < outer_comps + inner_comps; i++) {
2888 		struct r600_bytecode_gds gds;
2889 
2890 		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
2891 		gds.src_gpr = treg[i / 2];
2892 		gds.src_sel_x = 2 * (i % 2);
2893 		gds.src_sel_y = 1 + (2 * (i % 2));
2894 		gds.src_sel_z = 4;
2895 		gds.dst_sel_x = 7;
2896 		gds.dst_sel_y = 7;
2897 		gds.dst_sel_z = 7;
2898 		gds.dst_sel_w = 7;
2899 		gds.op = FETCH_OP_TF_WRITE;
2900 		r = r600_bytecode_add_gds(ctx->bc, &gds);
2901 		if (r)
2902 			return r;
2903 	}
2904 
2905 	// Patch up jump label
2906 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
2907 	cf_pop = ctx->bc->cf_last;
2908 
2909 	cf_jump->cf_addr = cf_pop->id + 2;
2910 	cf_jump->pop_count = 1;
2911 	cf_pop->cf_addr = cf_pop->id + 2;
2912 	cf_pop->pop_count = 1;
2913 
2914 	return 0;
2915 }
2916 
r600_shader_from_tgsi(struct r600_context * rctx,struct r600_pipe_shader * pipeshader,union r600_shader_key key)2917 static int r600_shader_from_tgsi(struct r600_context *rctx,
2918 				 struct r600_pipe_shader *pipeshader,
2919 				 union r600_shader_key key)
2920 {
2921 	struct r600_screen *rscreen = rctx->screen;
2922 	struct r600_shader *shader = &pipeshader->shader;
2923 	struct tgsi_token *tokens = pipeshader->selector->tokens;
2924 	struct pipe_stream_output_info so = pipeshader->selector->so;
2925 	struct tgsi_full_immediate *immediate;
2926 	struct r600_shader_ctx ctx;
2927 	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
2928 	unsigned output_done, noutput;
2929 	unsigned opcode;
2930 	int i, j, k, r = 0;
2931 	int next_param_base = 0, next_clip_base;
2932 	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
2933 	bool indirect_gprs;
2934 	bool ring_outputs = false;
2935 	bool lds_outputs = false;
2936 	bool lds_inputs = false;
2937 	bool pos_emitted = false;
2938 
2939 	ctx.bc = &shader->bc;
2940 	ctx.shader = shader;
2941 	ctx.native_integers = true;
2942 
2943 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
2944 			   rscreen->has_compressed_msaa_texturing);
2945 	ctx.tokens = tokens;
2946 	tgsi_scan_shader(tokens, &ctx.info);
2947 	shader->indirect_files = ctx.info.indirect_files;
2948 
2949 	shader->uses_doubles = ctx.info.uses_doubles;
2950 
2951 	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
2952 	tgsi_parse_init(&ctx.parse, tokens);
2953 	ctx.type = ctx.info.processor;
2954 	shader->processor_type = ctx.type;
2955 	ctx.bc->type = shader->processor_type;
2956 
2957 	switch (ctx.type) {
2958 	case PIPE_SHADER_VERTEX:
2959 		shader->vs_as_gs_a = key.vs.as_gs_a;
2960 		shader->vs_as_es = key.vs.as_es;
2961 		shader->vs_as_ls = key.vs.as_ls;
2962 		if (shader->vs_as_es)
2963 			ring_outputs = true;
2964 		if (shader->vs_as_ls)
2965 			lds_outputs = true;
2966 		break;
2967 	case PIPE_SHADER_GEOMETRY:
2968 		ring_outputs = true;
2969 		break;
2970 	case PIPE_SHADER_TESS_CTRL:
2971 		shader->tcs_prim_mode = key.tcs.prim_mode;
2972 		lds_outputs = true;
2973 		lds_inputs = true;
2974 		break;
2975 	case PIPE_SHADER_TESS_EVAL:
2976 		shader->tes_as_es = key.tes.as_es;
2977 		lds_inputs = true;
2978 		if (shader->tes_as_es)
2979 			ring_outputs = true;
2980 		break;
2981 	case PIPE_SHADER_FRAGMENT:
2982 		shader->two_side = key.ps.color_two_side;
2983 		break;
2984 	default:
2985 		break;
2986 	}
2987 
2988 	if (shader->vs_as_es || shader->tes_as_es) {
2989 		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
2990 	} else {
2991 		ctx.gs_for_vs = NULL;
2992 	}
2993 
2994 	ctx.next_ring_offset = 0;
2995 	ctx.gs_out_ring_offset = 0;
2996 	ctx.gs_next_vertex = 0;
2997 	ctx.gs_stream_output_info = &so;
2998 
2999 	ctx.face_gpr = -1;
3000 	ctx.fixed_pt_position_gpr = -1;
3001 	ctx.fragcoord_input = -1;
3002 	ctx.colors_used = 0;
3003 	ctx.clip_vertex_write = 0;
3004 
3005 	shader->nr_ps_color_exports = 0;
3006 	shader->nr_ps_max_color_exports = 0;
3007 
3008 
3009 	/* register allocations */
3010 	/* Values [0,127] correspond to GPR[0..127].
3011 	 * Values [128,159] correspond to constant buffer bank 0
3012 	 * Values [160,191] correspond to constant buffer bank 1
3013 	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3014 	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3015 	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3016 	 * Other special values are shown in the list below.
3017 	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3018 	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3019 	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3020 	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3021 	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3022 	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3023 	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3024 	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3025 	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3026 	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3027 	 * 254	SQ_ALU_SRC_PV: previous vector result.
3028 	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3029 	 */
3030 	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3031 		ctx.file_offset[i] = 0;
3032 	}
3033 
3034 	if (ctx.type == PIPE_SHADER_VERTEX) {
3035 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3036 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3037 	}
3038 	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3039 		if (ctx.bc->chip_class >= EVERGREEN)
3040 			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3041 		else
3042 			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3043 	}
3044 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3045 		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3046 		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3047 	}
3048 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3049 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3050 	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3051 		bool add_tesscoord = false, add_tess_inout = false;
3052 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3053 		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3054 			/* if we have tesscoord save one reg */
3055 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3056 				add_tesscoord = true;
3057 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3058 			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3059 				add_tess_inout = true;
3060 		}
3061 		if (add_tesscoord || add_tess_inout)
3062 			ctx.file_offset[TGSI_FILE_INPUT]++;
3063 		if (add_tess_inout)
3064 			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3065 	}
3066 
3067 	ctx.file_offset[TGSI_FILE_OUTPUT] =
3068 			ctx.file_offset[TGSI_FILE_INPUT] +
3069 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3070 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3071 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3072 
3073 	/* Outside the GPR range. This will be translated to one of the
3074 	 * kcache banks later. */
3075 	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3076 
3077 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3078 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3079 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3080 	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3081 	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3082 
3083 	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3084 		ctx.tess_input_info = ctx.bc->ar_reg + 3;
3085 		ctx.tess_output_info = ctx.bc->ar_reg + 4;
3086 		ctx.temp_reg = ctx.bc->ar_reg + 5;
3087 	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3088 		ctx.tess_input_info = 0;
3089 		ctx.tess_output_info = ctx.bc->ar_reg + 3;
3090 		ctx.temp_reg = ctx.bc->ar_reg + 4;
3091 	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3092 		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3093 		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3094 		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3095 		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3096 		ctx.temp_reg = ctx.bc->ar_reg + 7;
3097 	} else {
3098 		ctx.temp_reg = ctx.bc->ar_reg + 3;
3099 	}
3100 
3101 	shader->max_arrays = 0;
3102 	shader->num_arrays = 0;
3103 	if (indirect_gprs) {
3104 
3105 		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3106 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3107 			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3108 			                   ctx.file_offset[TGSI_FILE_INPUT],
3109 			                   0x0F);
3110 		}
3111 		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3112 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3113 			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3114 			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3115 			                   0x0F);
3116 		}
3117 	}
3118 
3119 	ctx.nliterals = 0;
3120 	ctx.literals = NULL;
3121 
3122 	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3123 			       ctx.info.colors_written == 1;
3124 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3125 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3126 
3127 	if (shader->vs_as_gs_a)
3128 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3129 
3130 	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3131 		r600_fetch_tess_io_info(&ctx);
3132 
3133 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3134 		tgsi_parse_token(&ctx.parse);
3135 		switch (ctx.parse.FullToken.Token.Type) {
3136 		case TGSI_TOKEN_TYPE_IMMEDIATE:
3137 			immediate = &ctx.parse.FullToken.FullImmediate;
3138 			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3139 			if(ctx.literals == NULL) {
3140 				r = -ENOMEM;
3141 				goto out_err;
3142 			}
3143 			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3144 			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3145 			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3146 			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3147 			ctx.nliterals++;
3148 			break;
3149 		case TGSI_TOKEN_TYPE_DECLARATION:
3150 			r = tgsi_declaration(&ctx);
3151 			if (r)
3152 				goto out_err;
3153 			break;
3154 		case TGSI_TOKEN_TYPE_INSTRUCTION:
3155 		case TGSI_TOKEN_TYPE_PROPERTY:
3156 			break;
3157 		default:
3158 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3159 			r = -EINVAL;
3160 			goto out_err;
3161 		}
3162 	}
3163 
3164 	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3165 	shader->ring_item_sizes[1] = 0;
3166 	shader->ring_item_sizes[2] = 0;
3167 	shader->ring_item_sizes[3] = 0;
3168 
3169 	/* Process two side if needed */
3170 	if (shader->two_side && ctx.colors_used) {
3171 		int i, count = ctx.shader->ninput;
3172 		unsigned next_lds_loc = ctx.shader->nlds;
3173 
3174 		/* additional inputs will be allocated right after the existing inputs,
3175 		 * we won't need them after the color selection, so we don't need to
3176 		 * reserve these gprs for the rest of the shader code and to adjust
3177 		 * output offsets etc. */
3178 		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3179 				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3180 
3181 		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3182 		if (ctx.face_gpr == -1) {
3183 			i = ctx.shader->ninput++;
3184 			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3185 			ctx.shader->input[i].spi_sid = 0;
3186 			ctx.shader->input[i].gpr = gpr++;
3187 			ctx.face_gpr = ctx.shader->input[i].gpr;
3188 		}
3189 
3190 		for (i = 0; i < count; i++) {
3191 			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3192 				int ni = ctx.shader->ninput++;
3193 				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3194 				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3195 				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3196 				ctx.shader->input[ni].gpr = gpr++;
3197 				// TGSI to LLVM needs to know the lds position of inputs.
3198 				// Non LLVM path computes it later (in process_twoside_color)
3199 				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3200 				ctx.shader->input[i].back_color_input = ni;
3201 				if (ctx.bc->chip_class >= EVERGREEN) {
3202 					if ((r = evergreen_interp_input(&ctx, ni)))
3203 						return r;
3204 				}
3205 			}
3206 		}
3207 	}
3208 
3209 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3210 		shader->nr_ps_max_color_exports = 8;
3211 
3212 	if (ctx.fragcoord_input >= 0) {
3213 		if (ctx.bc->chip_class == CAYMAN) {
3214 			for (j = 0 ; j < 4; j++) {
3215 				struct r600_bytecode_alu alu;
3216 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3217 				alu.op = ALU_OP1_RECIP_IEEE;
3218 				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3219 				alu.src[0].chan = 3;
3220 
3221 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3222 				alu.dst.chan = j;
3223 				alu.dst.write = (j == 3);
3224 				alu.last = 1;
3225 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3226 					return r;
3227 			}
3228 		} else {
3229 			struct r600_bytecode_alu alu;
3230 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3231 			alu.op = ALU_OP1_RECIP_IEEE;
3232 			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3233 			alu.src[0].chan = 3;
3234 
3235 			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3236 			alu.dst.chan = 3;
3237 			alu.dst.write = 1;
3238 			alu.last = 1;
3239 			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3240 				return r;
3241 		}
3242 	}
3243 
3244 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3245 		struct r600_bytecode_alu alu;
3246 		int r;
3247 
3248 		/* GS thread with no output workaround - emit a cut at start of GS */
3249 		if (ctx.bc->chip_class == R600)
3250 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3251 
3252 		for (j = 0; j < 4; j++) {
3253 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3254 			alu.op = ALU_OP1_MOV;
3255 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3256 			alu.src[0].value = 0;
3257 			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3258 			alu.dst.write = 1;
3259 			alu.last = 1;
3260 			r = r600_bytecode_add_alu(ctx.bc, &alu);
3261 			if (r)
3262 				return r;
3263 		}
3264 	}
3265 
3266 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3267 		r600_fetch_tess_io_info(&ctx);
3268 
3269 	if (shader->two_side && ctx.colors_used) {
3270 		if ((r = process_twoside_color_inputs(&ctx)))
3271 			return r;
3272 	}
3273 
3274 	tgsi_parse_init(&ctx.parse, tokens);
3275 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3276 		tgsi_parse_token(&ctx.parse);
3277 		switch (ctx.parse.FullToken.Token.Type) {
3278 		case TGSI_TOKEN_TYPE_INSTRUCTION:
3279 			r = tgsi_is_supported(&ctx);
3280 			if (r)
3281 				goto out_err;
3282 			ctx.max_driver_temp_used = 0;
3283 			/* reserve first tmp for everyone */
3284 			r600_get_temp(&ctx);
3285 
3286 			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3287 			if ((r = tgsi_split_constant(&ctx)))
3288 				goto out_err;
3289 			if ((r = tgsi_split_literal_constant(&ctx)))
3290 				goto out_err;
3291 			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3292 				if ((r = tgsi_split_gs_inputs(&ctx)))
3293 					goto out_err;
3294 			} else if (lds_inputs) {
3295 				if ((r = tgsi_split_lds_inputs(&ctx)))
3296 					goto out_err;
3297 			}
3298 			if (ctx.bc->chip_class == CAYMAN)
3299 				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3300 			else if (ctx.bc->chip_class >= EVERGREEN)
3301 				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3302 			else
3303 				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3304 			r = ctx.inst_info->process(&ctx);
3305 			if (r)
3306 				goto out_err;
3307 
3308 			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3309 				r = r600_store_tcs_output(&ctx);
3310 				if (r)
3311 					goto out_err;
3312 			}
3313 			break;
3314 		default:
3315 			break;
3316 		}
3317 	}
3318 
3319 	/* Reset the temporary register counter. */
3320 	ctx.max_driver_temp_used = 0;
3321 
3322 	noutput = shader->noutput;
3323 
3324 	if (!ring_outputs && ctx.clip_vertex_write) {
3325 		unsigned clipdist_temp[2];
3326 
3327 		clipdist_temp[0] = r600_get_temp(&ctx);
3328 		clipdist_temp[1] = r600_get_temp(&ctx);
3329 
3330 		/* need to convert a clipvertex write into clipdistance writes and not export
3331 		   the clip vertex anymore */
3332 
3333 		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3334 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3335 		shader->output[noutput].gpr = clipdist_temp[0];
3336 		noutput++;
3337 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3338 		shader->output[noutput].gpr = clipdist_temp[1];
3339 		noutput++;
3340 
3341 		/* reset spi_sid for clipvertex output to avoid confusing spi */
3342 		shader->output[ctx.cv_output].spi_sid = 0;
3343 
3344 		shader->clip_dist_write = 0xFF;
3345 
3346 		for (i = 0; i < 8; i++) {
3347 			int oreg = i >> 2;
3348 			int ochan = i & 3;
3349 
3350 			for (j = 0; j < 4; j++) {
3351 				struct r600_bytecode_alu alu;
3352 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3353 				alu.op = ALU_OP2_DOT4;
3354 				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3355 				alu.src[0].chan = j;
3356 
3357 				alu.src[1].sel = 512 + i;
3358 				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3359 				alu.src[1].chan = j;
3360 
3361 				alu.dst.sel = clipdist_temp[oreg];
3362 				alu.dst.chan = j;
3363 				alu.dst.write = (j == ochan);
3364 				if (j == 3)
3365 					alu.last = 1;
3366 				r = r600_bytecode_add_alu(ctx.bc, &alu);
3367 				if (r)
3368 					return r;
3369 			}
3370 		}
3371 	}
3372 
3373 	/* Add stream outputs. */
3374 	if (so.num_outputs) {
3375 		bool emit = false;
3376 		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3377 			emit = true;
3378 		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3379 			emit = true;
3380 		if (emit)
3381 			emit_streamout(&ctx, &so, -1, NULL);
3382 	}
3383 	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3384 	convert_edgeflag_to_int(&ctx);
3385 
3386 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3387 		r600_emit_tess_factor(&ctx);
3388 
3389 	if (lds_outputs) {
3390 		if (ctx.type == PIPE_SHADER_VERTEX) {
3391 			if (ctx.shader->noutput)
3392 				emit_lds_vs_writes(&ctx);
3393 		}
3394 	} else if (ring_outputs) {
3395 		if (shader->vs_as_es || shader->tes_as_es) {
3396 			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3397 			ctx.gs_export_gpr_tregs[1] = -1;
3398 			ctx.gs_export_gpr_tregs[2] = -1;
3399 			ctx.gs_export_gpr_tregs[3] = -1;
3400 
3401 			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3402 		}
3403 	} else {
3404 		/* Export output */
3405 		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3406 
3407 		for (i = 0, j = 0; i < noutput; i++, j++) {
3408 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3409 			output[j].gpr = shader->output[i].gpr;
3410 			output[j].elem_size = 3;
3411 			output[j].swizzle_x = 0;
3412 			output[j].swizzle_y = 1;
3413 			output[j].swizzle_z = 2;
3414 			output[j].swizzle_w = 3;
3415 			output[j].burst_count = 1;
3416 			output[j].type = -1;
3417 			output[j].op = CF_OP_EXPORT;
3418 			switch (ctx.type) {
3419 			case PIPE_SHADER_VERTEX:
3420 			case PIPE_SHADER_TESS_EVAL:
3421 				switch (shader->output[i].name) {
3422 				case TGSI_SEMANTIC_POSITION:
3423 					output[j].array_base = 60;
3424 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3425 					pos_emitted = true;
3426 					break;
3427 
3428 				case TGSI_SEMANTIC_PSIZE:
3429 					output[j].array_base = 61;
3430 					output[j].swizzle_y = 7;
3431 					output[j].swizzle_z = 7;
3432 					output[j].swizzle_w = 7;
3433 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3434 					pos_emitted = true;
3435 					break;
3436 				case TGSI_SEMANTIC_EDGEFLAG:
3437 					output[j].array_base = 61;
3438 					output[j].swizzle_x = 7;
3439 					output[j].swizzle_y = 0;
3440 					output[j].swizzle_z = 7;
3441 					output[j].swizzle_w = 7;
3442 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3443 					pos_emitted = true;
3444 					break;
3445 				case TGSI_SEMANTIC_LAYER:
3446 					/* spi_sid is 0 for outputs that are
3447 					 * not consumed by PS */
3448 					if (shader->output[i].spi_sid) {
3449 						output[j].array_base = next_param_base++;
3450 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3451 						j++;
3452 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3453 					}
3454 					output[j].array_base = 61;
3455 					output[j].swizzle_x = 7;
3456 					output[j].swizzle_y = 7;
3457 					output[j].swizzle_z = 0;
3458 					output[j].swizzle_w = 7;
3459 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3460 					pos_emitted = true;
3461 					break;
3462 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
3463 					/* spi_sid is 0 for outputs that are
3464 					 * not consumed by PS */
3465 					if (shader->output[i].spi_sid) {
3466 						output[j].array_base = next_param_base++;
3467 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3468 						j++;
3469 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3470 					}
3471 					output[j].array_base = 61;
3472 					output[j].swizzle_x = 7;
3473 					output[j].swizzle_y = 7;
3474 					output[j].swizzle_z = 7;
3475 					output[j].swizzle_w = 0;
3476 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3477 					pos_emitted = true;
3478 					break;
3479 				case TGSI_SEMANTIC_CLIPVERTEX:
3480 					j--;
3481 					break;
3482 				case TGSI_SEMANTIC_CLIPDIST:
3483 					output[j].array_base = next_clip_base++;
3484 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3485 					pos_emitted = true;
3486 					/* spi_sid is 0 for clipdistance outputs that were generated
3487 					 * for clipvertex - we don't need to pass them to PS */
3488 					if (shader->output[i].spi_sid) {
3489 						j++;
3490 						/* duplicate it as PARAM to pass to the pixel shader */
3491 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3492 						output[j].array_base = next_param_base++;
3493 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3494 					}
3495 					break;
3496 				case TGSI_SEMANTIC_FOG:
3497 					output[j].swizzle_y = 4; /* 0 */
3498 					output[j].swizzle_z = 4; /* 0 */
3499 					output[j].swizzle_w = 5; /* 1 */
3500 					break;
3501 				case TGSI_SEMANTIC_PRIMID:
3502 					output[j].swizzle_x = 2;
3503 					output[j].swizzle_y = 4; /* 0 */
3504 					output[j].swizzle_z = 4; /* 0 */
3505 					output[j].swizzle_w = 4; /* 0 */
3506 					break;
3507 				}
3508 
3509 				break;
3510 			case PIPE_SHADER_FRAGMENT:
3511 				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3512 					/* never export more colors than the number of CBs */
3513 					if (shader->output[i].sid >= max_color_exports) {
3514 						/* skip export */
3515 						j--;
3516 						continue;
3517 					}
3518 					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3519 					output[j].array_base = shader->output[i].sid;
3520 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3521 					shader->nr_ps_color_exports++;
3522 					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3523 						for (k = 1; k < max_color_exports; k++) {
3524 							j++;
3525 							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3526 							output[j].gpr = shader->output[i].gpr;
3527 							output[j].elem_size = 3;
3528 							output[j].swizzle_x = 0;
3529 							output[j].swizzle_y = 1;
3530 							output[j].swizzle_z = 2;
3531 							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3532 							output[j].burst_count = 1;
3533 							output[j].array_base = k;
3534 							output[j].op = CF_OP_EXPORT;
3535 							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3536 							shader->nr_ps_color_exports++;
3537 						}
3538 					}
3539 				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3540 					output[j].array_base = 61;
3541 					output[j].swizzle_x = 2;
3542 					output[j].swizzle_y = 7;
3543 					output[j].swizzle_z = output[j].swizzle_w = 7;
3544 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3545 				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3546 					output[j].array_base = 61;
3547 					output[j].swizzle_x = 7;
3548 					output[j].swizzle_y = 1;
3549 					output[j].swizzle_z = output[j].swizzle_w = 7;
3550 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3551 				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3552 					output[j].array_base = 61;
3553 					output[j].swizzle_x = 7;
3554 					output[j].swizzle_y = 7;
3555 					output[j].swizzle_z = 0;
3556 					output[j].swizzle_w = 7;
3557 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3558 				} else {
3559 					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3560 					r = -EINVAL;
3561 					goto out_err;
3562 				}
3563 				break;
3564 			case PIPE_SHADER_TESS_CTRL:
3565 				break;
3566 			default:
3567 				R600_ERR("unsupported processor type %d\n", ctx.type);
3568 				r = -EINVAL;
3569 				goto out_err;
3570 			}
3571 
3572 			if (output[j].type==-1) {
3573 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3574 				output[j].array_base = next_param_base++;
3575 			}
3576 		}
3577 
3578 		/* add fake position export */
3579 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3580 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3581 			output[j].gpr = 0;
3582 			output[j].elem_size = 3;
3583 			output[j].swizzle_x = 7;
3584 			output[j].swizzle_y = 7;
3585 			output[j].swizzle_z = 7;
3586 			output[j].swizzle_w = 7;
3587 			output[j].burst_count = 1;
3588 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3589 			output[j].array_base = 60;
3590 			output[j].op = CF_OP_EXPORT;
3591 			j++;
3592 		}
3593 
3594 		/* add fake param output for vertex shader if no param is exported */
3595 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3596 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3597 			output[j].gpr = 0;
3598 			output[j].elem_size = 3;
3599 			output[j].swizzle_x = 7;
3600 			output[j].swizzle_y = 7;
3601 			output[j].swizzle_z = 7;
3602 			output[j].swizzle_w = 7;
3603 			output[j].burst_count = 1;
3604 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3605 			output[j].array_base = 0;
3606 			output[j].op = CF_OP_EXPORT;
3607 			j++;
3608 		}
3609 
3610 		/* add fake pixel export */
3611 		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
3612 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3613 			output[j].gpr = 0;
3614 			output[j].elem_size = 3;
3615 			output[j].swizzle_x = 7;
3616 			output[j].swizzle_y = 7;
3617 			output[j].swizzle_z = 7;
3618 			output[j].swizzle_w = 7;
3619 			output[j].burst_count = 1;
3620 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3621 			output[j].array_base = 0;
3622 			output[j].op = CF_OP_EXPORT;
3623 			j++;
3624 			shader->nr_ps_color_exports++;
3625 		}
3626 
3627 		noutput = j;
3628 
3629 		/* set export done on last export of each type */
3630 		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
3631 			if (!(output_done & (1 << output[i].type))) {
3632 				output_done |= (1 << output[i].type);
3633 				output[i].op = CF_OP_EXPORT_DONE;
3634 			}
3635 		}
3636 		/* add output to bytecode */
3637 		for (i = 0; i < noutput; i++) {
3638 			r = r600_bytecode_add_output(ctx.bc, &output[i]);
3639 			if (r)
3640 				goto out_err;
3641 		}
3642 	}
3643 
3644 	/* add program end */
3645 	if (ctx.bc->chip_class == CAYMAN)
3646 		cm_bytecode_add_cf_end(ctx.bc);
3647 	else {
3648 		const struct cf_op_info *last = NULL;
3649 
3650 		if (ctx.bc->cf_last)
3651 			last = r600_isa_cf(ctx.bc->cf_last->op);
3652 
3653 		/* alu clause instructions don't have EOP bit, so add NOP */
3654 		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
3655 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3656 
3657 		ctx.bc->cf_last->end_of_program = 1;
3658 	}
3659 
3660 	/* check GPR limit - we have 124 = 128 - 4
3661 	 * (4 are reserved as alu clause temporary registers) */
3662 	if (ctx.bc->ngpr > 124) {
3663 		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3664 		r = -ENOMEM;
3665 		goto out_err;
3666 	}
3667 
3668 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3669 		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3670 			return r;
3671 	}
3672 
3673 	free(ctx.literals);
3674 	tgsi_parse_free(&ctx.parse);
3675 	return 0;
3676 out_err:
3677 	free(ctx.literals);
3678 	tgsi_parse_free(&ctx.parse);
3679 	return r;
3680 }
3681 
tgsi_unsupported(struct r600_shader_ctx * ctx)3682 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3683 {
3684 	const unsigned tgsi_opcode =
3685 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3686 	R600_ERR("%s tgsi opcode unsupported\n",
3687 		 tgsi_get_opcode_name(tgsi_opcode));
3688 	return -EINVAL;
3689 }
3690 
/* No-op instruction handler: emits nothing and always returns 0. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx; /* intentionally unused */

	return 0;
}
3695 
r600_bytecode_src(struct r600_bytecode_alu_src * bc_src,const struct r600_shader_src * shader_src,unsigned chan)3696 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3697 			const struct r600_shader_src *shader_src,
3698 			unsigned chan)
3699 {
3700 	bc_src->sel = shader_src->sel;
3701 	bc_src->chan = shader_src->swizzle[chan];
3702 	bc_src->neg = shader_src->neg;
3703 	bc_src->abs = shader_src->abs;
3704 	bc_src->rel = shader_src->rel;
3705 	bc_src->value = shader_src->value[bc_src->chan];
3706 	bc_src->kc_bank = shader_src->kc_bank;
3707 	bc_src->kc_rel = shader_src->kc_rel;
3708 }
3709 
r600_bytecode_src_set_abs(struct r600_bytecode_alu_src * bc_src)3710 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3711 {
3712 	bc_src->abs = 1;
3713 	bc_src->neg = 0;
3714 }
3715 
r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src * bc_src)3716 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3717 {
3718 	bc_src->neg = !bc_src->neg;
3719 }
3720 
tgsi_dst(struct r600_shader_ctx * ctx,const struct tgsi_full_dst_register * tgsi_dst,unsigned swizzle,struct r600_bytecode_alu_dst * r600_dst)3721 static void tgsi_dst(struct r600_shader_ctx *ctx,
3722 		     const struct tgsi_full_dst_register *tgsi_dst,
3723 		     unsigned swizzle,
3724 		     struct r600_bytecode_alu_dst *r600_dst)
3725 {
3726 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3727 
3728 	r600_dst->sel = tgsi_dst->Register.Index;
3729 	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3730 	r600_dst->chan = swizzle;
3731 	r600_dst->write = 1;
3732 	if (inst->Instruction.Saturate) {
3733 		r600_dst->clamp = 1;
3734 	}
3735 	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
3736 		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3737 			return;
3738 		}
3739 	}
3740 	if (tgsi_dst->Register.Indirect)
3741 		r600_dst->rel = V_SQ_REL_RELATIVE;
3742 
3743 }
3744 
tgsi_op2_64_params(struct r600_shader_ctx * ctx,bool singledest,bool swap)3745 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
3746 {
3747 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3748 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3749 	struct r600_bytecode_alu alu;
3750 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
3751 	int use_tmp = 0;
3752 
3753 	if (singledest) {
3754 		switch (write_mask) {
3755 		case 0x1:
3756 			write_mask = 0x3;
3757 			break;
3758 		case 0x2:
3759 			use_tmp = 1;
3760 			write_mask = 0x3;
3761 			break;
3762 		case 0x4:
3763 			write_mask = 0xc;
3764 			break;
3765 		case 0x8:
3766 			write_mask = 0xc;
3767 			use_tmp = 3;
3768 			break;
3769 		}
3770 	}
3771 
3772 	lasti = tgsi_last_instruction(write_mask);
3773 	for (i = 0; i <= lasti; i++) {
3774 
3775 		if (!(write_mask & (1 << i)))
3776 			continue;
3777 
3778 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3779 
3780 		if (singledest) {
3781 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3782 			if (use_tmp) {
3783 				alu.dst.sel = ctx->temp_reg;
3784 				alu.dst.chan = i;
3785 				alu.dst.write = 1;
3786 			}
3787 			if (i == 1 || i == 3)
3788 				alu.dst.write = 0;
3789 		} else
3790 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3791 
3792 		alu.op = ctx->inst_info->op;
3793 		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
3794 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3795 		} else if (!swap) {
3796 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3797 				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
3798 			}
3799 		} else {
3800 			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
3801 			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
3802 		}
3803 
3804 		/* handle some special cases */
3805 		if (i == 1 || i == 3) {
3806 			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
3807 			case TGSI_OPCODE_DABS:
3808 				r600_bytecode_src_set_abs(&alu.src[0]);
3809 				break;
3810 			default:
3811 				break;
3812 			}
3813 		}
3814 		if (i == lasti) {
3815 			alu.last = 1;
3816 		}
3817 		r = r600_bytecode_add_alu(ctx->bc, &alu);
3818 		if (r)
3819 			return r;
3820 	}
3821 
3822 	if (use_tmp) {
3823 		write_mask = inst->Dst[0].Register.WriteMask;
3824 
3825 		/* move result from temp to dst */
3826 		for (i = 0; i <= lasti; i++) {
3827 			if (!(write_mask & (1 << i)))
3828 				continue;
3829 
3830 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3831 			alu.op = ALU_OP1_MOV;
3832 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3833 			alu.src[0].sel = ctx->temp_reg;
3834 			alu.src[0].chan = use_tmp - 1;
3835 			alu.last = (i == lasti);
3836 
3837 			r = r600_bytecode_add_alu(ctx->bc, &alu);
3838 			if (r)
3839 				return r;
3840 		}
3841 	}
3842 	return 0;
3843 }
3844 
tgsi_op2_64(struct r600_shader_ctx * ctx)3845 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
3846 {
3847 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3848 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3849 	/* confirm writemasking */
3850 	if ((write_mask & 0x3) != 0x3 &&
3851 	    (write_mask & 0xc) != 0xc) {
3852 		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
3853 		return -1;
3854 	}
3855 	return tgsi_op2_64_params(ctx, false, false);
3856 }
3857 
/* 64-bit op in single-destination mode, sources in TGSI order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	const bool single_dest = true;
	const bool swap_sources = false;

	return tgsi_op2_64_params(ctx, single_dest, swap_sources);
}
3862 
/* 64-bit op in single-destination mode with the two sources swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	const bool single_dest = true;
	const bool swap_sources = true;

	return tgsi_op2_64_params(ctx, single_dest, swap_sources);
}
3867 
tgsi_op3_64(struct r600_shader_ctx * ctx)3868 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
3869 {
3870 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3871 	struct r600_bytecode_alu alu;
3872 	int i, j, r;
3873 	int lasti = 3;
3874 	int tmp = r600_get_temp(ctx);
3875 
3876 	for (i = 0; i < lasti + 1; i++) {
3877 
3878 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3879 		alu.op = ctx->inst_info->op;
3880 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3881 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
3882 		}
3883 
3884 		if (inst->Dst[0].Register.WriteMask & (1 << i))
3885 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3886 		else
3887 			alu.dst.sel = tmp;
3888 
3889 		alu.dst.chan = i;
3890 		alu.is_op3 = 1;
3891 		if (i == lasti) {
3892 			alu.last = 1;
3893 		}
3894 		r = r600_bytecode_add_alu(ctx->bc, &alu);
3895 		if (r)
3896 			return r;
3897 	}
3898 	return 0;
3899 }
3900 
tgsi_op2_s(struct r600_shader_ctx * ctx,int swap,int trans_only)3901 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
3902 {
3903 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3904 	struct r600_bytecode_alu alu;
3905 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3906 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
3907 	/* use temp register if trans_only and more than one dst component */
3908 	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
3909 
3910 	for (i = 0; i <= lasti; i++) {
3911 		if (!(write_mask & (1 << i)))
3912 			continue;
3913 
3914 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3915 		if (use_tmp) {
3916 			alu.dst.sel = ctx->temp_reg;
3917 			alu.dst.chan = i;
3918 			alu.dst.write = 1;
3919 		} else
3920 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3921 
3922 		alu.op = ctx->inst_info->op;
3923 		if (!swap) {
3924 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3925 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3926 			}
3927 		} else {
3928 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3929 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3930 		}
3931 		if (i == lasti || trans_only) {
3932 			alu.last = 1;
3933 		}
3934 		r = r600_bytecode_add_alu(ctx->bc, &alu);
3935 		if (r)
3936 			return r;
3937 	}
3938 
3939 	if (use_tmp) {
3940 		/* move result from temp to dst */
3941 		for (i = 0; i <= lasti; i++) {
3942 			if (!(write_mask & (1 << i)))
3943 				continue;
3944 
3945 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3946 			alu.op = ALU_OP1_MOV;
3947 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3948 			alu.src[0].sel = ctx->temp_reg;
3949 			alu.src[0].chan = i;
3950 			alu.last = (i == lasti);
3951 
3952 			r = r600_bytecode_add_alu(ctx->bc, &alu);
3953 			if (r)
3954 				return r;
3955 		}
3956 	}
3957 	return 0;
3958 }
3959 
/* Plain per-channel op: sources in TGSI order, not trans-only. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
3964 
/* Per-channel op with the two sources exchanged, not trans-only. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
3969 
/* Per-channel op in trans-only mode, sources in TGSI order. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
3974 
tgsi_ineg(struct r600_shader_ctx * ctx)3975 static int tgsi_ineg(struct r600_shader_ctx *ctx)
3976 {
3977 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3978 	struct r600_bytecode_alu alu;
3979 	int i, r;
3980 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3981 
3982 	for (i = 0; i < lasti + 1; i++) {
3983 
3984 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3985 			continue;
3986 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3987 		alu.op = ctx->inst_info->op;
3988 
3989 		alu.src[0].sel = V_SQ_ALU_SRC_0;
3990 
3991 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3992 
3993 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3994 
3995 		if (i == lasti) {
3996 			alu.last = 1;
3997 		}
3998 		r = r600_bytecode_add_alu(ctx->bc, &alu);
3999 		if (r)
4000 			return r;
4001 	}
4002 	return 0;
4003 
4004 }
4005 
tgsi_dneg(struct r600_shader_ctx * ctx)4006 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4007 {
4008 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4009 	struct r600_bytecode_alu alu;
4010 	int i, r;
4011 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4012 
4013 	for (i = 0; i < lasti + 1; i++) {
4014 
4015 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4016 			continue;
4017 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4018 		alu.op = ALU_OP1_MOV;
4019 
4020 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4021 
4022 		if (i == 1 || i == 3)
4023 			r600_bytecode_src_toggle_neg(&alu.src[0]);
4024 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4025 
4026 		if (i == lasti) {
4027 			alu.last = 1;
4028 		}
4029 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4030 		if (r)
4031 			return r;
4032 	}
4033 	return 0;
4034 
4035 }
4036 
tgsi_dfracexp(struct r600_shader_ctx * ctx)4037 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4038 {
4039 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4040 	struct r600_bytecode_alu alu;
4041 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4042 	int i, j, r;
4043 	int firsti = write_mask == 0xc ? 2 : 0;
4044 
4045 	for (i = 0; i <= 3; i++) {
4046 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4047 		alu.op = ctx->inst_info->op;
4048 
4049 		alu.dst.sel = ctx->temp_reg;
4050 		alu.dst.chan = i;
4051 		alu.dst.write = 1;
4052 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4053 			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4054 		}
4055 
4056 		if (i == 3)
4057 			alu.last = 1;
4058 
4059 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4060 		if (r)
4061 			return r;
4062 	}
4063 
4064 	/* MOV first two channels to writemask dst0 */
4065 	for (i = 0; i <= 1; i++) {
4066 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4067 		alu.op = ALU_OP1_MOV;
4068 		alu.src[0].chan = i + 2;
4069 		alu.src[0].sel = ctx->temp_reg;
4070 
4071 		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
4072 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
4073 		alu.last = 1;
4074 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4075 		if (r)
4076 			return r;
4077 	}
4078 
4079 	for (i = 0; i <= 3; i++) {
4080 		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4081 			/* MOV third channels to writemask dst1 */
4082 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4083 			alu.op = ALU_OP1_MOV;
4084 			alu.src[0].chan = 1;
4085 			alu.src[0].sel = ctx->temp_reg;
4086 
4087 			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4088 			alu.last = 1;
4089 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4090 			if (r)
4091 				return r;
4092 			break;
4093 		}
4094 	}
4095 	return 0;
4096 }
4097 
4098 
egcm_int_to_double(struct r600_shader_ctx * ctx)4099 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4100 {
4101 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4102 	struct r600_bytecode_alu alu;
4103 	int i, r;
4104 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4105 
4106 	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4107 		inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4108 
4109 	for (i = 0; i <= (lasti+1)/2; i++) {
4110 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4111 		alu.op = ctx->inst_info->op;
4112 
4113 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4114 		alu.dst.sel = ctx->temp_reg;
4115 		alu.dst.chan = i;
4116 		alu.dst.write = 1;
4117 		alu.last = 1;
4118 
4119 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4120 		if (r)
4121 			return r;
4122 	}
4123 
4124 	for (i = 0; i <= lasti; i++) {
4125 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4126 		alu.op = ALU_OP1_FLT32_TO_FLT64;
4127 
4128 		alu.src[0].chan = i/2;
4129 		if (i%2 == 0)
4130 			alu.src[0].sel = ctx->temp_reg;
4131 		else {
4132 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4133 			alu.src[0].value = 0x0;
4134 		}
4135 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4136 		alu.last = i == lasti;
4137 
4138 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4139 		if (r)
4140 			return r;
4141 	}
4142 
4143 	return 0;
4144 }
4145 
egcm_double_to_int(struct r600_shader_ctx * ctx)4146 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4147 {
4148 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4149 	struct r600_bytecode_alu alu;
4150 	int i, r;
4151 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4152 
4153 	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4154 		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4155 
4156 	for (i = 0; i <= lasti; i++) {
4157 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4158 		alu.op = ALU_OP1_FLT64_TO_FLT32;
4159 
4160 		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
4161 		alu.dst.chan = i;
4162 		alu.dst.sel = ctx->temp_reg;
4163 		alu.dst.write = i%2 == 0;
4164 		alu.last = i == lasti;
4165 
4166 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4167 		if (r)
4168 			return r;
4169 	}
4170 
4171 	for (i = 0; i <= (lasti+1)/2; i++) {
4172 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4173 		alu.op = ctx->inst_info->op;
4174 
4175 		alu.src[0].chan = i*2;
4176 		alu.src[0].sel = ctx->temp_reg;
4177 		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4178 		alu.last = 1;
4179 
4180 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4181 		if (r)
4182 			return r;
4183 	}
4184 
4185 	return 0;
4186 }
4187 
cayman_emit_unary_double_raw(struct r600_bytecode * bc,unsigned op,int dst_reg,struct r600_shader_src * src,bool abs)4188 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
4189 					unsigned op,
4190 					int dst_reg,
4191 					struct r600_shader_src *src,
4192 					bool abs)
4193 {
4194 	struct r600_bytecode_alu alu;
4195 	const int last_slot = 3;
4196 	int r;
4197 
4198 	/* these have to write the result to X/Y by the looks of it */
4199 	for (int i = 0 ; i < last_slot; i++) {
4200 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4201 		alu.op = op;
4202 
4203 		r600_bytecode_src(&alu.src[0], src, 1);
4204 		r600_bytecode_src(&alu.src[1], src, 0);
4205 
4206 		if (abs)
4207 			r600_bytecode_src_set_abs(&alu.src[1]);
4208 
4209 		alu.dst.sel = dst_reg;
4210 		alu.dst.chan = i;
4211 		alu.dst.write = (i == 0 || i == 1);
4212 
4213 		if (bc->chip_class != CAYMAN || i == last_slot - 1)
4214 			alu.last = 1;
4215 		r = r600_bytecode_add_alu(bc, &alu);
4216 		if (r)
4217 			return r;
4218 	}
4219 
4220 	return 0;
4221 }
4222 
cayman_emit_double_instr(struct r600_shader_ctx * ctx)4223 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
4224 {
4225 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4226 	int i, r;
4227 	struct r600_bytecode_alu alu;
4228 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4229 	int t1 = ctx->temp_reg;
4230 
4231 	/* should only be one src regs */
4232 	assert(inst->Instruction.NumSrcRegs == 1);
4233 
4234 	/* only support one double at a time */
4235 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4236 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4237 
4238 	r = cayman_emit_unary_double_raw(
4239 		ctx->bc, ctx->inst_info->op, t1,
4240 		&ctx->src[0],
4241 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
4242 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
4243 	if (r)
4244 		return r;
4245 
4246 	for (i = 0 ; i <= lasti; i++) {
4247 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4248 			continue;
4249 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4250 		alu.op = ALU_OP1_MOV;
4251 		alu.src[0].sel = t1;
4252 		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
4253 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4254 		alu.dst.write = 1;
4255 		if (i == lasti)
4256 			alu.last = 1;
4257 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4258 		if (r)
4259 			return r;
4260 	}
4261 	return 0;
4262 }
4263 
cayman_emit_float_instr(struct r600_shader_ctx * ctx)4264 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4265 {
4266 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4267 	int i, j, r;
4268 	struct r600_bytecode_alu alu;
4269 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4270 
4271 	for (i = 0 ; i < last_slot; i++) {
4272 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4273 		alu.op = ctx->inst_info->op;
4274 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4275 			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4276 
4277 			/* RSQ should take the absolute value of src */
4278 			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4279 				r600_bytecode_src_set_abs(&alu.src[j]);
4280 			}
4281 		}
4282 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4283 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4284 
4285 		if (i == last_slot - 1)
4286 			alu.last = 1;
4287 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4288 		if (r)
4289 			return r;
4290 	}
4291 	return 0;
4292 }
4293 
cayman_mul_int_instr(struct r600_shader_ctx * ctx)4294 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
4295 {
4296 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4297 	int i, j, k, r;
4298 	struct r600_bytecode_alu alu;
4299 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4300 	int t1 = ctx->temp_reg;
4301 
4302 	for (k = 0; k <= lasti; k++) {
4303 		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
4304 			continue;
4305 
4306 		for (i = 0 ; i < 4; i++) {
4307 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4308 			alu.op = ctx->inst_info->op;
4309 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4310 				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
4311 			}
4312 			alu.dst.sel = t1;
4313 			alu.dst.chan = i;
4314 			alu.dst.write = (i == k);
4315 			if (i == 3)
4316 				alu.last = 1;
4317 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4318 			if (r)
4319 				return r;
4320 		}
4321 	}
4322 
4323 	for (i = 0 ; i <= lasti; i++) {
4324 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4325 			continue;
4326 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4327 		alu.op = ALU_OP1_MOV;
4328 		alu.src[0].sel = t1;
4329 		alu.src[0].chan = i;
4330 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4331 		alu.dst.write = 1;
4332 		if (i == lasti)
4333 			alu.last = 1;
4334 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4335 		if (r)
4336 			return r;
4337 	}
4338 
4339 	return 0;
4340 }
4341 
4342 
cayman_mul_double_instr(struct r600_shader_ctx * ctx)4343 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
4344 {
4345 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4346 	int i, j, k, r;
4347 	struct r600_bytecode_alu alu;
4348 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4349 	int t1 = ctx->temp_reg;
4350 
4351 	/* t1 would get overwritten below if we actually tried to
4352 	 * multiply two pairs of doubles at a time. */
4353 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4354 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4355 
4356 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4357 
4358 	for (i = 0; i < 4; i++) {
4359 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4360 		alu.op = ctx->inst_info->op;
4361 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4362 			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
4363 		}
4364 		alu.dst.sel = t1;
4365 		alu.dst.chan = i;
4366 		alu.dst.write = 1;
4367 		if (i == 3)
4368 			alu.last = 1;
4369 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4370 		if (r)
4371 			return r;
4372 	}
4373 
4374 	for (i = 0; i <= lasti; i++) {
4375 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4376 			continue;
4377 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4378 		alu.op = ALU_OP1_MOV;
4379 		alu.src[0].sel = t1;
4380 		alu.src[0].chan = i;
4381 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4382 		alu.dst.write = 1;
4383 		if (i == lasti)
4384 			alu.last = 1;
4385 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4386 		if (r)
4387 			return r;
4388 	}
4389 
4390 	return 0;
4391 }
4392 
4393 /*
4394  * Emit RECIP_64 + MUL_64 to implement division.
4395  */
cayman_ddiv_instr(struct r600_shader_ctx * ctx)4396 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
4397 {
4398 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4399 	int r;
4400 	struct r600_bytecode_alu alu;
4401 	int t1 = ctx->temp_reg;
4402 	int k;
4403 
4404 	/* Only support one double at a time. This is the same constraint as
4405 	 * in DMUL lowering. */
4406 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4407 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4408 
4409 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4410 
4411 	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
4412 	if (r)
4413 		return r;
4414 
4415 	for (int i = 0; i < 4; i++) {
4416 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4417 		alu.op = ALU_OP2_MUL_64;
4418 
4419 		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
4420 
4421 		alu.src[1].sel = t1;
4422 		alu.src[1].chan = (i == 3) ? 0 : 1;
4423 
4424 		alu.dst.sel = t1;
4425 		alu.dst.chan = i;
4426 		alu.dst.write = 1;
4427 		if (i == 3)
4428 			alu.last = 1;
4429 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4430 		if (r)
4431 			return r;
4432 	}
4433 
4434 	for (int i = 0; i < 2; i++) {
4435 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4436 		alu.op = ALU_OP1_MOV;
4437 		alu.src[0].sel = t1;
4438 		alu.src[0].chan = i;
4439 		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
4440 		alu.dst.write = 1;
4441 		if (i == 1)
4442 			alu.last = 1;
4443 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4444 		if (r)
4445 			return r;
4446 	}
4447 	return 0;
4448 }
4449 
4450 /*
4451  * r600 - trunc to -PI..PI range
4452  * r700 - normalize by dividing by 2PI
4453  * see fdo bug 27901
4454  */
tgsi_setup_trig(struct r600_shader_ctx * ctx)4455 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
4456 {
4457 	int r;
4458 	struct r600_bytecode_alu alu;
4459 
4460 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4461 	alu.op = ALU_OP3_MULADD;
4462 	alu.is_op3 = 1;
4463 
4464 	alu.dst.chan = 0;
4465 	alu.dst.sel = ctx->temp_reg;
4466 	alu.dst.write = 1;
4467 
4468 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4469 
4470 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4471 	alu.src[1].chan = 0;
4472 	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
4473 	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4474 	alu.src[2].chan = 0;
4475 	alu.last = 1;
4476 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4477 	if (r)
4478 		return r;
4479 
4480 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4481 	alu.op = ALU_OP1_FRACT;
4482 
4483 	alu.dst.chan = 0;
4484 	alu.dst.sel = ctx->temp_reg;
4485 	alu.dst.write = 1;
4486 
4487 	alu.src[0].sel = ctx->temp_reg;
4488 	alu.src[0].chan = 0;
4489 	alu.last = 1;
4490 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4491 	if (r)
4492 		return r;
4493 
4494 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4495 	alu.op = ALU_OP3_MULADD;
4496 	alu.is_op3 = 1;
4497 
4498 	alu.dst.chan = 0;
4499 	alu.dst.sel = ctx->temp_reg;
4500 	alu.dst.write = 1;
4501 
4502 	alu.src[0].sel = ctx->temp_reg;
4503 	alu.src[0].chan = 0;
4504 
4505 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4506 	alu.src[1].chan = 0;
4507 	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4508 	alu.src[2].chan = 0;
4509 
4510 	if (ctx->bc->chip_class == R600) {
4511 		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
4512 		alu.src[2].value = u_bitcast_f2u(-M_PI);
4513 	} else {
4514 		alu.src[1].sel = V_SQ_ALU_SRC_1;
4515 		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4516 		alu.src[2].neg = 1;
4517 	}
4518 
4519 	alu.last = 1;
4520 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4521 	if (r)
4522 		return r;
4523 	return 0;
4524 }
4525 
cayman_trig(struct r600_shader_ctx * ctx)4526 static int cayman_trig(struct r600_shader_ctx *ctx)
4527 {
4528 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4529 	struct r600_bytecode_alu alu;
4530 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4531 	int i, r;
4532 
4533 	r = tgsi_setup_trig(ctx);
4534 	if (r)
4535 		return r;
4536 
4537 
4538 	for (i = 0; i < last_slot; i++) {
4539 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4540 		alu.op = ctx->inst_info->op;
4541 		alu.dst.chan = i;
4542 
4543 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4544 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4545 
4546 		alu.src[0].sel = ctx->temp_reg;
4547 		alu.src[0].chan = 0;
4548 		if (i == last_slot - 1)
4549 			alu.last = 1;
4550 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4551 		if (r)
4552 			return r;
4553 	}
4554 	return 0;
4555 }
4556 
tgsi_trig(struct r600_shader_ctx * ctx)4557 static int tgsi_trig(struct r600_shader_ctx *ctx)
4558 {
4559 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4560 	struct r600_bytecode_alu alu;
4561 	int i, r;
4562 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4563 
4564 	r = tgsi_setup_trig(ctx);
4565 	if (r)
4566 		return r;
4567 
4568 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4569 	alu.op = ctx->inst_info->op;
4570 	alu.dst.chan = 0;
4571 	alu.dst.sel = ctx->temp_reg;
4572 	alu.dst.write = 1;
4573 
4574 	alu.src[0].sel = ctx->temp_reg;
4575 	alu.src[0].chan = 0;
4576 	alu.last = 1;
4577 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4578 	if (r)
4579 		return r;
4580 
4581 	/* replicate result */
4582 	for (i = 0; i < lasti + 1; i++) {
4583 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4584 			continue;
4585 
4586 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4587 		alu.op = ALU_OP1_MOV;
4588 
4589 		alu.src[0].sel = ctx->temp_reg;
4590 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4591 		if (i == lasti)
4592 			alu.last = 1;
4593 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4594 		if (r)
4595 			return r;
4596 	}
4597 	return 0;
4598 }
4599 
tgsi_scs(struct r600_shader_ctx * ctx)4600 static int tgsi_scs(struct r600_shader_ctx *ctx)
4601 {
4602 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4603 	struct r600_bytecode_alu alu;
4604 	int i, r;
4605 
4606 	/* We'll only need the trig stuff if we are going to write to the
4607 	 * X or Y components of the destination vector.
4608 	 */
4609 	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
4610 		r = tgsi_setup_trig(ctx);
4611 		if (r)
4612 			return r;
4613 	}
4614 
4615 	/* dst.x = COS */
4616 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
4617 		if (ctx->bc->chip_class == CAYMAN) {
4618 			for (i = 0 ; i < 3; i++) {
4619 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4620 				alu.op = ALU_OP1_COS;
4621 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4622 
4623 				if (i == 0)
4624 					alu.dst.write = 1;
4625 				else
4626 					alu.dst.write = 0;
4627 				alu.src[0].sel = ctx->temp_reg;
4628 				alu.src[0].chan = 0;
4629 				if (i == 2)
4630 					alu.last = 1;
4631 				r = r600_bytecode_add_alu(ctx->bc, &alu);
4632 				if (r)
4633 					return r;
4634 			}
4635 		} else {
4636 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4637 			alu.op = ALU_OP1_COS;
4638 			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4639 
4640 			alu.src[0].sel = ctx->temp_reg;
4641 			alu.src[0].chan = 0;
4642 			alu.last = 1;
4643 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4644 			if (r)
4645 				return r;
4646 		}
4647 	}
4648 
4649 	/* dst.y = SIN */
4650 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
4651 		if (ctx->bc->chip_class == CAYMAN) {
4652 			for (i = 0 ; i < 3; i++) {
4653 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4654 				alu.op = ALU_OP1_SIN;
4655 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4656 				if (i == 1)
4657 					alu.dst.write = 1;
4658 				else
4659 					alu.dst.write = 0;
4660 				alu.src[0].sel = ctx->temp_reg;
4661 				alu.src[0].chan = 0;
4662 				if (i == 2)
4663 					alu.last = 1;
4664 				r = r600_bytecode_add_alu(ctx->bc, &alu);
4665 				if (r)
4666 					return r;
4667 			}
4668 		} else {
4669 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4670 			alu.op = ALU_OP1_SIN;
4671 			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
4672 
4673 			alu.src[0].sel = ctx->temp_reg;
4674 			alu.src[0].chan = 0;
4675 			alu.last = 1;
4676 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4677 			if (r)
4678 				return r;
4679 		}
4680 	}
4681 
4682 	/* dst.z = 0.0; */
4683 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
4684 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4685 
4686 		alu.op = ALU_OP1_MOV;
4687 
4688 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
4689 
4690 		alu.src[0].sel = V_SQ_ALU_SRC_0;
4691 		alu.src[0].chan = 0;
4692 
4693 		alu.last = 1;
4694 
4695 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4696 		if (r)
4697 			return r;
4698 	}
4699 
4700 	/* dst.w = 1.0; */
4701 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
4702 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4703 
4704 		alu.op = ALU_OP1_MOV;
4705 
4706 		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
4707 
4708 		alu.src[0].sel = V_SQ_ALU_SRC_1;
4709 		alu.src[0].chan = 0;
4710 
4711 		alu.last = 1;
4712 
4713 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4714 		if (r)
4715 			return r;
4716 	}
4717 
4718 	return 0;
4719 }
4720 
tgsi_kill(struct r600_shader_ctx * ctx)4721 static int tgsi_kill(struct r600_shader_ctx *ctx)
4722 {
4723 	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4724 	struct r600_bytecode_alu alu;
4725 	int i, r;
4726 
4727 	for (i = 0; i < 4; i++) {
4728 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4729 		alu.op = ctx->inst_info->op;
4730 
4731 		alu.dst.chan = i;
4732 
4733 		alu.src[0].sel = V_SQ_ALU_SRC_0;
4734 
4735 		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4736 			alu.src[1].sel = V_SQ_ALU_SRC_1;
4737 			alu.src[1].neg = 1;
4738 		} else {
4739 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4740 		}
4741 		if (i == 3) {
4742 			alu.last = 1;
4743 		}
4744 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4745 		if (r)
4746 			return r;
4747 	}
4748 
4749 	/* kill must be last in ALU */
4750 	ctx->bc->force_add_cf = 1;
4751 	ctx->shader->uses_kill = TRUE;
4752 	return 0;
4753 }
4754 
tgsi_lit(struct r600_shader_ctx * ctx)4755 static int tgsi_lit(struct r600_shader_ctx *ctx)
4756 {
4757 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4758 	struct r600_bytecode_alu alu;
4759 	int r;
4760 
4761 	/* tmp.x = max(src.y, 0.0) */
4762 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4763 	alu.op = ALU_OP2_MAX;
4764 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
4765 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
4766 	alu.src[1].chan = 1;
4767 
4768 	alu.dst.sel = ctx->temp_reg;
4769 	alu.dst.chan = 0;
4770 	alu.dst.write = 1;
4771 
4772 	alu.last = 1;
4773 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4774 	if (r)
4775 		return r;
4776 
4777 	if (inst->Dst[0].Register.WriteMask & (1 << 2))
4778 	{
4779 		int chan;
4780 		int sel;
4781 		unsigned i;
4782 
4783 		if (ctx->bc->chip_class == CAYMAN) {
4784 			for (i = 0; i < 3; i++) {
4785 				/* tmp.z = log(tmp.x) */
4786 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4787 				alu.op = ALU_OP1_LOG_CLAMPED;
4788 				alu.src[0].sel = ctx->temp_reg;
4789 				alu.src[0].chan = 0;
4790 				alu.dst.sel = ctx->temp_reg;
4791 				alu.dst.chan = i;
4792 				if (i == 2) {
4793 					alu.dst.write = 1;
4794 					alu.last = 1;
4795 				} else
4796 					alu.dst.write = 0;
4797 
4798 				r = r600_bytecode_add_alu(ctx->bc, &alu);
4799 				if (r)
4800 					return r;
4801 			}
4802 		} else {
4803 			/* tmp.z = log(tmp.x) */
4804 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4805 			alu.op = ALU_OP1_LOG_CLAMPED;
4806 			alu.src[0].sel = ctx->temp_reg;
4807 			alu.src[0].chan = 0;
4808 			alu.dst.sel = ctx->temp_reg;
4809 			alu.dst.chan = 2;
4810 			alu.dst.write = 1;
4811 			alu.last = 1;
4812 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4813 			if (r)
4814 				return r;
4815 		}
4816 
4817 		chan = alu.dst.chan;
4818 		sel = alu.dst.sel;
4819 
4820 		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
4821 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4822 		alu.op = ALU_OP3_MUL_LIT;
4823 		alu.src[0].sel  = sel;
4824 		alu.src[0].chan = chan;
4825 		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
4826 		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
4827 		alu.dst.sel = ctx->temp_reg;
4828 		alu.dst.chan = 0;
4829 		alu.dst.write = 1;
4830 		alu.is_op3 = 1;
4831 		alu.last = 1;
4832 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4833 		if (r)
4834 			return r;
4835 
4836 		if (ctx->bc->chip_class == CAYMAN) {
4837 			for (i = 0; i < 3; i++) {
4838 				/* dst.z = exp(tmp.x) */
4839 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4840 				alu.op = ALU_OP1_EXP_IEEE;
4841 				alu.src[0].sel = ctx->temp_reg;
4842 				alu.src[0].chan = 0;
4843 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4844 				if (i == 2) {
4845 					alu.dst.write = 1;
4846 					alu.last = 1;
4847 				} else
4848 					alu.dst.write = 0;
4849 				r = r600_bytecode_add_alu(ctx->bc, &alu);
4850 				if (r)
4851 					return r;
4852 			}
4853 		} else {
4854 			/* dst.z = exp(tmp.x) */
4855 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4856 			alu.op = ALU_OP1_EXP_IEEE;
4857 			alu.src[0].sel = ctx->temp_reg;
4858 			alu.src[0].chan = 0;
4859 			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
4860 			alu.last = 1;
4861 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4862 			if (r)
4863 				return r;
4864 		}
4865 	}
4866 
4867 	/* dst.x, <- 1.0  */
4868 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4869 	alu.op = ALU_OP1_MOV;
4870 	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
4871 	alu.src[0].chan = 0;
4872 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4873 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
4874 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4875 	if (r)
4876 		return r;
4877 
4878 	/* dst.y = max(src.x, 0.0) */
4879 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4880 	alu.op = ALU_OP2_MAX;
4881 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4882 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
4883 	alu.src[1].chan = 0;
4884 	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
4885 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
4886 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4887 	if (r)
4888 		return r;
4889 
4890 	/* dst.w, <- 1.0  */
4891 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4892 	alu.op = ALU_OP1_MOV;
4893 	alu.src[0].sel  = V_SQ_ALU_SRC_1;
4894 	alu.src[0].chan = 0;
4895 	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
4896 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
4897 	alu.last = 1;
4898 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4899 	if (r)
4900 		return r;
4901 
4902 	return 0;
4903 }
4904 
tgsi_rsq(struct r600_shader_ctx * ctx)4905 static int tgsi_rsq(struct r600_shader_ctx *ctx)
4906 {
4907 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4908 	struct r600_bytecode_alu alu;
4909 	int i, r;
4910 
4911 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4912 
4913 	/* XXX:
4914 	 * For state trackers other than OpenGL, we'll want to use
4915 	 * _RECIPSQRT_IEEE instead.
4916 	 */
4917 	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
4918 
4919 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4920 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4921 		r600_bytecode_src_set_abs(&alu.src[i]);
4922 	}
4923 	alu.dst.sel = ctx->temp_reg;
4924 	alu.dst.write = 1;
4925 	alu.last = 1;
4926 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4927 	if (r)
4928 		return r;
4929 	/* replicate result */
4930 	return tgsi_helper_tempx_replicate(ctx);
4931 }
4932 
tgsi_helper_tempx_replicate(struct r600_shader_ctx * ctx)4933 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
4934 {
4935 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4936 	struct r600_bytecode_alu alu;
4937 	int i, r;
4938 
4939 	for (i = 0; i < 4; i++) {
4940 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4941 		alu.src[0].sel = ctx->temp_reg;
4942 		alu.op = ALU_OP1_MOV;
4943 		alu.dst.chan = i;
4944 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4945 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4946 		if (i == 3)
4947 			alu.last = 1;
4948 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4949 		if (r)
4950 			return r;
4951 	}
4952 	return 0;
4953 }
4954 
tgsi_trans_srcx_replicate(struct r600_shader_ctx * ctx)4955 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
4956 {
4957 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4958 	struct r600_bytecode_alu alu;
4959 	int i, r;
4960 
4961 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4962 	alu.op = ctx->inst_info->op;
4963 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4964 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4965 	}
4966 	alu.dst.sel = ctx->temp_reg;
4967 	alu.dst.write = 1;
4968 	alu.last = 1;
4969 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4970 	if (r)
4971 		return r;
4972 	/* replicate result */
4973 	return tgsi_helper_tempx_replicate(ctx);
4974 }
4975 
cayman_pow(struct r600_shader_ctx * ctx)4976 static int cayman_pow(struct r600_shader_ctx *ctx)
4977 {
4978 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4979 	int i, r;
4980 	struct r600_bytecode_alu alu;
4981 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4982 
4983 	for (i = 0; i < 3; i++) {
4984 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4985 		alu.op = ALU_OP1_LOG_IEEE;
4986 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4987 		alu.dst.sel = ctx->temp_reg;
4988 		alu.dst.chan = i;
4989 		alu.dst.write = 1;
4990 		if (i == 2)
4991 			alu.last = 1;
4992 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4993 		if (r)
4994 			return r;
4995 	}
4996 
4997 	/* b * LOG2(a) */
4998 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4999 	alu.op = ALU_OP2_MUL;
5000 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5001 	alu.src[1].sel = ctx->temp_reg;
5002 	alu.dst.sel = ctx->temp_reg;
5003 	alu.dst.write = 1;
5004 	alu.last = 1;
5005 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5006 	if (r)
5007 		return r;
5008 
5009 	for (i = 0; i < last_slot; i++) {
5010 		/* POW(a,b) = EXP2(b * LOG2(a))*/
5011 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5012 		alu.op = ALU_OP1_EXP_IEEE;
5013 		alu.src[0].sel = ctx->temp_reg;
5014 
5015 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5016 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5017 		if (i == last_slot - 1)
5018 			alu.last = 1;
5019 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5020 		if (r)
5021 			return r;
5022 	}
5023 	return 0;
5024 }
5025 
tgsi_pow(struct r600_shader_ctx * ctx)5026 static int tgsi_pow(struct r600_shader_ctx *ctx)
5027 {
5028 	struct r600_bytecode_alu alu;
5029 	int r;
5030 
5031 	/* LOG2(a) */
5032 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5033 	alu.op = ALU_OP1_LOG_IEEE;
5034 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5035 	alu.dst.sel = ctx->temp_reg;
5036 	alu.dst.write = 1;
5037 	alu.last = 1;
5038 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5039 	if (r)
5040 		return r;
5041 	/* b * LOG2(a) */
5042 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5043 	alu.op = ALU_OP2_MUL;
5044 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5045 	alu.src[1].sel = ctx->temp_reg;
5046 	alu.dst.sel = ctx->temp_reg;
5047 	alu.dst.write = 1;
5048 	alu.last = 1;
5049 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5050 	if (r)
5051 		return r;
5052 	/* POW(a,b) = EXP2(b * LOG2(a))*/
5053 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5054 	alu.op = ALU_OP1_EXP_IEEE;
5055 	alu.src[0].sel = ctx->temp_reg;
5056 	alu.dst.sel = ctx->temp_reg;
5057 	alu.dst.write = 1;
5058 	alu.last = 1;
5059 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5060 	if (r)
5061 		return r;
5062 	return tgsi_helper_tempx_replicate(ctx);
5063 }
5064 
tgsi_divmod(struct r600_shader_ctx * ctx,int mod,int signed_op)5065 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5066 {
5067 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5068 	struct r600_bytecode_alu alu;
5069 	int i, r, j;
5070 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5071 	int tmp0 = ctx->temp_reg;
5072 	int tmp1 = r600_get_temp(ctx);
5073 	int tmp2 = r600_get_temp(ctx);
5074 	int tmp3 = r600_get_temp(ctx);
5075 	/* Unsigned path:
5076 	 *
5077 	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5078 	 *
5079 	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5080 	 * 2. tmp0.z = lo (tmp0.x * src2)
5081 	 * 3. tmp0.w = -tmp0.z
5082 	 * 4. tmp0.y = hi (tmp0.x * src2)
5083 	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5084 	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5085 	 * 7. tmp1.x = tmp0.x - tmp0.w
5086 	 * 8. tmp1.y = tmp0.x + tmp0.w
5087 	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5088 	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5089 	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5090 	 *
5091 	 * 12. tmp0.w = src1 - tmp0.y       = r
5092 	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5093 	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5094 	 *
5095 	 * if DIV
5096 	 *
5097 	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5098 	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5099 	 *
5100 	 * else MOD
5101 	 *
5102 	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5103 	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5104 	 *
5105 	 * endif
5106 	 *
5107 	 * 17. tmp1.x = tmp1.x & tmp1.y
5108 	 *
5109 	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5110 	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5111 	 *
5112 	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5113 	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5114 	 *
5115 	 * Signed path:
5116 	 *
5117 	 * Same as unsigned, using abs values of the operands,
5118 	 * and fixing the sign of the result in the end.
5119 	 */
5120 
5121 	for (i = 0; i < 4; i++) {
5122 		if (!(write_mask & (1<<i)))
5123 			continue;
5124 
5125 		if (signed_op) {
5126 
5127 			/* tmp2.x = -src0 */
5128 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5129 			alu.op = ALU_OP2_SUB_INT;
5130 
5131 			alu.dst.sel = tmp2;
5132 			alu.dst.chan = 0;
5133 			alu.dst.write = 1;
5134 
5135 			alu.src[0].sel = V_SQ_ALU_SRC_0;
5136 
5137 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5138 
5139 			alu.last = 1;
5140 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5141 				return r;
5142 
5143 			/* tmp2.y = -src1 */
5144 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5145 			alu.op = ALU_OP2_SUB_INT;
5146 
5147 			alu.dst.sel = tmp2;
5148 			alu.dst.chan = 1;
5149 			alu.dst.write = 1;
5150 
5151 			alu.src[0].sel = V_SQ_ALU_SRC_0;
5152 
5153 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5154 
5155 			alu.last = 1;
5156 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5157 				return r;
5158 
5159 			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5160 			/* it will be a sign of the quotient */
5161 			if (!mod) {
5162 
5163 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5164 				alu.op = ALU_OP2_XOR_INT;
5165 
5166 				alu.dst.sel = tmp2;
5167 				alu.dst.chan = 2;
5168 				alu.dst.write = 1;
5169 
5170 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5171 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5172 
5173 				alu.last = 1;
5174 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5175 					return r;
5176 			}
5177 
5178 			/* tmp2.x = |src0| */
5179 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5180 			alu.op = ALU_OP3_CNDGE_INT;
5181 			alu.is_op3 = 1;
5182 
5183 			alu.dst.sel = tmp2;
5184 			alu.dst.chan = 0;
5185 			alu.dst.write = 1;
5186 
5187 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5188 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5189 			alu.src[2].sel = tmp2;
5190 			alu.src[2].chan = 0;
5191 
5192 			alu.last = 1;
5193 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5194 				return r;
5195 
5196 			/* tmp2.y = |src1| */
5197 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5198 			alu.op = ALU_OP3_CNDGE_INT;
5199 			alu.is_op3 = 1;
5200 
5201 			alu.dst.sel = tmp2;
5202 			alu.dst.chan = 1;
5203 			alu.dst.write = 1;
5204 
5205 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5206 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5207 			alu.src[2].sel = tmp2;
5208 			alu.src[2].chan = 1;
5209 
5210 			alu.last = 1;
5211 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5212 				return r;
5213 
5214 		}
5215 
5216 		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
5217 		if (ctx->bc->chip_class == CAYMAN) {
5218 			/* tmp3.x = u2f(src2) */
5219 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5220 			alu.op = ALU_OP1_UINT_TO_FLT;
5221 
5222 			alu.dst.sel = tmp3;
5223 			alu.dst.chan = 0;
5224 			alu.dst.write = 1;
5225 
5226 			if (signed_op) {
5227 				alu.src[0].sel = tmp2;
5228 				alu.src[0].chan = 1;
5229 			} else {
5230 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5231 			}
5232 
5233 			alu.last = 1;
5234 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5235 				return r;
5236 
5237 			/* tmp0.x = recip(tmp3.x) */
5238 			for (j = 0 ; j < 3; j++) {
5239 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5240 				alu.op = ALU_OP1_RECIP_IEEE;
5241 
5242 				alu.dst.sel = tmp0;
5243 				alu.dst.chan = j;
5244 				alu.dst.write = (j == 0);
5245 
5246 				alu.src[0].sel = tmp3;
5247 				alu.src[0].chan = 0;
5248 
5249 				if (j == 2)
5250 					alu.last = 1;
5251 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5252 					return r;
5253 			}
5254 
5255 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5256 			alu.op = ALU_OP2_MUL;
5257 
5258 			alu.src[0].sel = tmp0;
5259 			alu.src[0].chan = 0;
5260 
5261 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5262 			alu.src[1].value = 0x4f800000;
5263 
5264 			alu.dst.sel = tmp3;
5265 			alu.dst.write = 1;
5266 			alu.last = 1;
5267 			r = r600_bytecode_add_alu(ctx->bc, &alu);
5268 			if (r)
5269 				return r;
5270 
5271 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5272 			alu.op = ALU_OP1_FLT_TO_UINT;
5273 
5274 			alu.dst.sel = tmp0;
5275 			alu.dst.chan = 0;
5276 			alu.dst.write = 1;
5277 
5278 			alu.src[0].sel = tmp3;
5279 			alu.src[0].chan = 0;
5280 
5281 			alu.last = 1;
5282 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5283 				return r;
5284 
5285 		} else {
5286 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5287 			alu.op = ALU_OP1_RECIP_UINT;
5288 
5289 			alu.dst.sel = tmp0;
5290 			alu.dst.chan = 0;
5291 			alu.dst.write = 1;
5292 
5293 			if (signed_op) {
5294 				alu.src[0].sel = tmp2;
5295 				alu.src[0].chan = 1;
5296 			} else {
5297 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5298 			}
5299 
5300 			alu.last = 1;
5301 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5302 				return r;
5303 		}
5304 
5305 		/* 2. tmp0.z = lo (tmp0.x * src2) */
5306 		if (ctx->bc->chip_class == CAYMAN) {
5307 			for (j = 0 ; j < 4; j++) {
5308 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5309 				alu.op = ALU_OP2_MULLO_UINT;
5310 
5311 				alu.dst.sel = tmp0;
5312 				alu.dst.chan = j;
5313 				alu.dst.write = (j == 2);
5314 
5315 				alu.src[0].sel = tmp0;
5316 				alu.src[0].chan = 0;
5317 				if (signed_op) {
5318 					alu.src[1].sel = tmp2;
5319 					alu.src[1].chan = 1;
5320 				} else {
5321 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5322 				}
5323 
5324 				alu.last = (j == 3);
5325 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5326 					return r;
5327 			}
5328 		} else {
5329 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5330 			alu.op = ALU_OP2_MULLO_UINT;
5331 
5332 			alu.dst.sel = tmp0;
5333 			alu.dst.chan = 2;
5334 			alu.dst.write = 1;
5335 
5336 			alu.src[0].sel = tmp0;
5337 			alu.src[0].chan = 0;
5338 			if (signed_op) {
5339 				alu.src[1].sel = tmp2;
5340 				alu.src[1].chan = 1;
5341 			} else {
5342 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5343 			}
5344 
5345 			alu.last = 1;
5346 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5347 				return r;
5348 		}
5349 
5350 		/* 3. tmp0.w = -tmp0.z */
5351 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5352 		alu.op = ALU_OP2_SUB_INT;
5353 
5354 		alu.dst.sel = tmp0;
5355 		alu.dst.chan = 3;
5356 		alu.dst.write = 1;
5357 
5358 		alu.src[0].sel = V_SQ_ALU_SRC_0;
5359 		alu.src[1].sel = tmp0;
5360 		alu.src[1].chan = 2;
5361 
5362 		alu.last = 1;
5363 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5364 			return r;
5365 
5366 		/* 4. tmp0.y = hi (tmp0.x * src2) */
5367 		if (ctx->bc->chip_class == CAYMAN) {
5368 			for (j = 0 ; j < 4; j++) {
5369 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5370 				alu.op = ALU_OP2_MULHI_UINT;
5371 
5372 				alu.dst.sel = tmp0;
5373 				alu.dst.chan = j;
5374 				alu.dst.write = (j == 1);
5375 
5376 				alu.src[0].sel = tmp0;
5377 				alu.src[0].chan = 0;
5378 
5379 				if (signed_op) {
5380 					alu.src[1].sel = tmp2;
5381 					alu.src[1].chan = 1;
5382 				} else {
5383 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5384 				}
5385 				alu.last = (j == 3);
5386 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5387 					return r;
5388 			}
5389 		} else {
5390 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5391 			alu.op = ALU_OP2_MULHI_UINT;
5392 
5393 			alu.dst.sel = tmp0;
5394 			alu.dst.chan = 1;
5395 			alu.dst.write = 1;
5396 
5397 			alu.src[0].sel = tmp0;
5398 			alu.src[0].chan = 0;
5399 
5400 			if (signed_op) {
5401 				alu.src[1].sel = tmp2;
5402 				alu.src[1].chan = 1;
5403 			} else {
5404 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5405 			}
5406 
5407 			alu.last = 1;
5408 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5409 				return r;
5410 		}
5411 
5412 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
5413 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5414 		alu.op = ALU_OP3_CNDE_INT;
5415 		alu.is_op3 = 1;
5416 
5417 		alu.dst.sel = tmp0;
5418 		alu.dst.chan = 2;
5419 		alu.dst.write = 1;
5420 
5421 		alu.src[0].sel = tmp0;
5422 		alu.src[0].chan = 1;
5423 		alu.src[1].sel = tmp0;
5424 		alu.src[1].chan = 3;
5425 		alu.src[2].sel = tmp0;
5426 		alu.src[2].chan = 2;
5427 
5428 		alu.last = 1;
5429 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5430 			return r;
5431 
5432 		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
5433 		if (ctx->bc->chip_class == CAYMAN) {
5434 			for (j = 0 ; j < 4; j++) {
5435 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5436 				alu.op = ALU_OP2_MULHI_UINT;
5437 
5438 				alu.dst.sel = tmp0;
5439 				alu.dst.chan = j;
5440 				alu.dst.write = (j == 3);
5441 
5442 				alu.src[0].sel = tmp0;
5443 				alu.src[0].chan = 2;
5444 
5445 				alu.src[1].sel = tmp0;
5446 				alu.src[1].chan = 0;
5447 
5448 				alu.last = (j == 3);
5449 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5450 					return r;
5451 			}
5452 		} else {
5453 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5454 			alu.op = ALU_OP2_MULHI_UINT;
5455 
5456 			alu.dst.sel = tmp0;
5457 			alu.dst.chan = 3;
5458 			alu.dst.write = 1;
5459 
5460 			alu.src[0].sel = tmp0;
5461 			alu.src[0].chan = 2;
5462 
5463 			alu.src[1].sel = tmp0;
5464 			alu.src[1].chan = 0;
5465 
5466 			alu.last = 1;
5467 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5468 				return r;
5469 		}
5470 
5471 		/* 7. tmp1.x = tmp0.x - tmp0.w */
5472 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5473 		alu.op = ALU_OP2_SUB_INT;
5474 
5475 		alu.dst.sel = tmp1;
5476 		alu.dst.chan = 0;
5477 		alu.dst.write = 1;
5478 
5479 		alu.src[0].sel = tmp0;
5480 		alu.src[0].chan = 0;
5481 		alu.src[1].sel = tmp0;
5482 		alu.src[1].chan = 3;
5483 
5484 		alu.last = 1;
5485 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5486 			return r;
5487 
5488 		/* 8. tmp1.y = tmp0.x + tmp0.w */
5489 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5490 		alu.op = ALU_OP2_ADD_INT;
5491 
5492 		alu.dst.sel = tmp1;
5493 		alu.dst.chan = 1;
5494 		alu.dst.write = 1;
5495 
5496 		alu.src[0].sel = tmp0;
5497 		alu.src[0].chan = 0;
5498 		alu.src[1].sel = tmp0;
5499 		alu.src[1].chan = 3;
5500 
5501 		alu.last = 1;
5502 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5503 			return r;
5504 
5505 		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5506 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5507 		alu.op = ALU_OP3_CNDE_INT;
5508 		alu.is_op3 = 1;
5509 
5510 		alu.dst.sel = tmp0;
5511 		alu.dst.chan = 0;
5512 		alu.dst.write = 1;
5513 
5514 		alu.src[0].sel = tmp0;
5515 		alu.src[0].chan = 1;
5516 		alu.src[1].sel = tmp1;
5517 		alu.src[1].chan = 1;
5518 		alu.src[2].sel = tmp1;
5519 		alu.src[2].chan = 0;
5520 
5521 		alu.last = 1;
5522 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5523 			return r;
5524 
5525 		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
5526 		if (ctx->bc->chip_class == CAYMAN) {
5527 			for (j = 0 ; j < 4; j++) {
5528 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5529 				alu.op = ALU_OP2_MULHI_UINT;
5530 
5531 				alu.dst.sel = tmp0;
5532 				alu.dst.chan = j;
5533 				alu.dst.write = (j == 2);
5534 
5535 				alu.src[0].sel = tmp0;
5536 				alu.src[0].chan = 0;
5537 
5538 				if (signed_op) {
5539 					alu.src[1].sel = tmp2;
5540 					alu.src[1].chan = 0;
5541 				} else {
5542 					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5543 				}
5544 
5545 				alu.last = (j == 3);
5546 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5547 					return r;
5548 			}
5549 		} else {
5550 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5551 			alu.op = ALU_OP2_MULHI_UINT;
5552 
5553 			alu.dst.sel = tmp0;
5554 			alu.dst.chan = 2;
5555 			alu.dst.write = 1;
5556 
5557 			alu.src[0].sel = tmp0;
5558 			alu.src[0].chan = 0;
5559 
5560 			if (signed_op) {
5561 				alu.src[1].sel = tmp2;
5562 				alu.src[1].chan = 0;
5563 			} else {
5564 				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5565 			}
5566 
5567 			alu.last = 1;
5568 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5569 				return r;
5570 		}
5571 
5572 		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
5573 		if (ctx->bc->chip_class == CAYMAN) {
5574 			for (j = 0 ; j < 4; j++) {
5575 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5576 				alu.op = ALU_OP2_MULLO_UINT;
5577 
5578 				alu.dst.sel = tmp0;
5579 				alu.dst.chan = j;
5580 				alu.dst.write = (j == 1);
5581 
5582 				if (signed_op) {
5583 					alu.src[0].sel = tmp2;
5584 					alu.src[0].chan = 1;
5585 				} else {
5586 					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5587 				}
5588 
5589 				alu.src[1].sel = tmp0;
5590 				alu.src[1].chan = 2;
5591 
5592 				alu.last = (j == 3);
5593 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5594 					return r;
5595 			}
5596 		} else {
5597 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5598 			alu.op = ALU_OP2_MULLO_UINT;
5599 
5600 			alu.dst.sel = tmp0;
5601 			alu.dst.chan = 1;
5602 			alu.dst.write = 1;
5603 
5604 			if (signed_op) {
5605 				alu.src[0].sel = tmp2;
5606 				alu.src[0].chan = 1;
5607 			} else {
5608 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5609 			}
5610 
5611 			alu.src[1].sel = tmp0;
5612 			alu.src[1].chan = 2;
5613 
5614 			alu.last = 1;
5615 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5616 				return r;
5617 		}
5618 
5619 		/* 12. tmp0.w = src1 - tmp0.y       = r */
5620 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5621 		alu.op = ALU_OP2_SUB_INT;
5622 
5623 		alu.dst.sel = tmp0;
5624 		alu.dst.chan = 3;
5625 		alu.dst.write = 1;
5626 
5627 		if (signed_op) {
5628 			alu.src[0].sel = tmp2;
5629 			alu.src[0].chan = 0;
5630 		} else {
5631 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5632 		}
5633 
5634 		alu.src[1].sel = tmp0;
5635 		alu.src[1].chan = 1;
5636 
5637 		alu.last = 1;
5638 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5639 			return r;
5640 
5641 		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
5642 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5643 		alu.op = ALU_OP2_SETGE_UINT;
5644 
5645 		alu.dst.sel = tmp1;
5646 		alu.dst.chan = 0;
5647 		alu.dst.write = 1;
5648 
5649 		alu.src[0].sel = tmp0;
5650 		alu.src[0].chan = 3;
5651 		if (signed_op) {
5652 			alu.src[1].sel = tmp2;
5653 			alu.src[1].chan = 1;
5654 		} else {
5655 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5656 		}
5657 
5658 		alu.last = 1;
5659 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5660 			return r;
5661 
5662 		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
5663 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5664 		alu.op = ALU_OP2_SETGE_UINT;
5665 
5666 		alu.dst.sel = tmp1;
5667 		alu.dst.chan = 1;
5668 		alu.dst.write = 1;
5669 
5670 		if (signed_op) {
5671 			alu.src[0].sel = tmp2;
5672 			alu.src[0].chan = 0;
5673 		} else {
5674 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5675 		}
5676 
5677 		alu.src[1].sel = tmp0;
5678 		alu.src[1].chan = 1;
5679 
5680 		alu.last = 1;
5681 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5682 			return r;
5683 
5684 		if (mod) { /* UMOD */
5685 
5686 			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
5687 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5688 			alu.op = ALU_OP2_SUB_INT;
5689 
5690 			alu.dst.sel = tmp1;
5691 			alu.dst.chan = 2;
5692 			alu.dst.write = 1;
5693 
5694 			alu.src[0].sel = tmp0;
5695 			alu.src[0].chan = 3;
5696 
5697 			if (signed_op) {
5698 				alu.src[1].sel = tmp2;
5699 				alu.src[1].chan = 1;
5700 			} else {
5701 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5702 			}
5703 
5704 			alu.last = 1;
5705 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5706 				return r;
5707 
5708 			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
5709 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5710 			alu.op = ALU_OP2_ADD_INT;
5711 
5712 			alu.dst.sel = tmp1;
5713 			alu.dst.chan = 3;
5714 			alu.dst.write = 1;
5715 
5716 			alu.src[0].sel = tmp0;
5717 			alu.src[0].chan = 3;
5718 			if (signed_op) {
5719 				alu.src[1].sel = tmp2;
5720 				alu.src[1].chan = 1;
5721 			} else {
5722 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5723 			}
5724 
5725 			alu.last = 1;
5726 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5727 				return r;
5728 
5729 		} else { /* UDIV */
5730 
5731 			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
5732 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5733 			alu.op = ALU_OP2_ADD_INT;
5734 
5735 			alu.dst.sel = tmp1;
5736 			alu.dst.chan = 2;
5737 			alu.dst.write = 1;
5738 
5739 			alu.src[0].sel = tmp0;
5740 			alu.src[0].chan = 2;
5741 			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5742 
5743 			alu.last = 1;
5744 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5745 				return r;
5746 
5747 			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
5748 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5749 			alu.op = ALU_OP2_ADD_INT;
5750 
5751 			alu.dst.sel = tmp1;
5752 			alu.dst.chan = 3;
5753 			alu.dst.write = 1;
5754 
5755 			alu.src[0].sel = tmp0;
5756 			alu.src[0].chan = 2;
5757 			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5758 
5759 			alu.last = 1;
5760 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5761 				return r;
5762 
5763 		}
5764 
5765 		/* 17. tmp1.x = tmp1.x & tmp1.y */
5766 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5767 		alu.op = ALU_OP2_AND_INT;
5768 
5769 		alu.dst.sel = tmp1;
5770 		alu.dst.chan = 0;
5771 		alu.dst.write = 1;
5772 
5773 		alu.src[0].sel = tmp1;
5774 		alu.src[0].chan = 0;
5775 		alu.src[1].sel = tmp1;
5776 		alu.src[1].chan = 1;
5777 
5778 		alu.last = 1;
5779 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5780 			return r;
5781 
5782 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
5783 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
5784 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5785 		alu.op = ALU_OP3_CNDE_INT;
5786 		alu.is_op3 = 1;
5787 
5788 		alu.dst.sel = tmp0;
5789 		alu.dst.chan = 2;
5790 		alu.dst.write = 1;
5791 
5792 		alu.src[0].sel = tmp1;
5793 		alu.src[0].chan = 0;
5794 		alu.src[1].sel = tmp0;
5795 		alu.src[1].chan = mod ? 3 : 2;
5796 		alu.src[2].sel = tmp1;
5797 		alu.src[2].chan = 2;
5798 
5799 		alu.last = 1;
5800 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5801 			return r;
5802 
5803 		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5804 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5805 		alu.op = ALU_OP3_CNDE_INT;
5806 		alu.is_op3 = 1;
5807 
5808 		if (signed_op) {
5809 			alu.dst.sel = tmp0;
5810 			alu.dst.chan = 2;
5811 			alu.dst.write = 1;
5812 		} else {
5813 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5814 		}
5815 
5816 		alu.src[0].sel = tmp1;
5817 		alu.src[0].chan = 1;
5818 		alu.src[1].sel = tmp1;
5819 		alu.src[1].chan = 3;
5820 		alu.src[2].sel = tmp0;
5821 		alu.src[2].chan = 2;
5822 
5823 		alu.last = 1;
5824 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5825 			return r;
5826 
5827 		if (signed_op) {
5828 
5829 			/* fix the sign of the result */
5830 
5831 			if (mod) {
5832 
5833 				/* tmp0.x = -tmp0.z */
5834 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5835 				alu.op = ALU_OP2_SUB_INT;
5836 
5837 				alu.dst.sel = tmp0;
5838 				alu.dst.chan = 0;
5839 				alu.dst.write = 1;
5840 
5841 				alu.src[0].sel = V_SQ_ALU_SRC_0;
5842 				alu.src[1].sel = tmp0;
5843 				alu.src[1].chan = 2;
5844 
5845 				alu.last = 1;
5846 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5847 					return r;
5848 
5849 				/* sign of the remainder is the same as the sign of src0 */
5850 				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
5851 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5852 				alu.op = ALU_OP3_CNDGE_INT;
5853 				alu.is_op3 = 1;
5854 
5855 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5856 
5857 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5858 				alu.src[1].sel = tmp0;
5859 				alu.src[1].chan = 2;
5860 				alu.src[2].sel = tmp0;
5861 				alu.src[2].chan = 0;
5862 
5863 				alu.last = 1;
5864 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5865 					return r;
5866 
5867 			} else {
5868 
5869 				/* tmp0.x = -tmp0.z */
5870 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5871 				alu.op = ALU_OP2_SUB_INT;
5872 
5873 				alu.dst.sel = tmp0;
5874 				alu.dst.chan = 0;
5875 				alu.dst.write = 1;
5876 
5877 				alu.src[0].sel = V_SQ_ALU_SRC_0;
5878 				alu.src[1].sel = tmp0;
5879 				alu.src[1].chan = 2;
5880 
5881 				alu.last = 1;
5882 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5883 					return r;
5884 
5885 				/* fix the quotient sign (same as the sign of src0*src1) */
5886 				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
5887 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5888 				alu.op = ALU_OP3_CNDGE_INT;
5889 				alu.is_op3 = 1;
5890 
5891 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5892 
5893 				alu.src[0].sel = tmp2;
5894 				alu.src[0].chan = 2;
5895 				alu.src[1].sel = tmp0;
5896 				alu.src[1].chan = 2;
5897 				alu.src[2].sel = tmp0;
5898 				alu.src[2].chan = 0;
5899 
5900 				alu.last = 1;
5901 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5902 					return r;
5903 			}
5904 		}
5905 	}
5906 	return 0;
5907 }
5908 
/*
 * TGSI UDIV: unsigned integer quotient.
 *
 * Thin wrapper that hands both flags of tgsi_divmod() as zero
 * (argument order presumed to be mod, signed_op).
 */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* quotient, not remainder */
	const int signed_op = 0;	/* unsigned operands */

	return tgsi_divmod(ctx, mod, signed_op);
}
5913 
/*
 * TGSI UMOD: unsigned integer remainder.
 *
 * Thin wrapper around tgsi_divmod() with the first flag set and the
 * second clear (argument order presumed to be mod, signed_op).
 */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* remainder, not quotient */
	const int signed_op = 0;	/* unsigned operands */

	return tgsi_divmod(ctx, mod, signed_op);
}
5918 
/*
 * TGSI IDIV: signed integer quotient.
 *
 * Thin wrapper around tgsi_divmod() with the first flag clear and the
 * second set (argument order presumed to be mod, signed_op).
 */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* quotient, not remainder */
	const int signed_op = 1;	/* signed operands */

	return tgsi_divmod(ctx, mod, signed_op);
}
5923 
/*
 * TGSI IMOD: signed integer remainder.
 *
 * Thin wrapper around tgsi_divmod() with both flags set
 * (argument order presumed to be mod, signed_op).
 */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* remainder, not quotient */
	const int signed_op = 1;	/* signed operands */

	return tgsi_divmod(ctx, mod, signed_op);
}
5928 
5929 
tgsi_f2i(struct r600_shader_ctx * ctx)5930 static int tgsi_f2i(struct r600_shader_ctx *ctx)
5931 {
5932 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5933 	struct r600_bytecode_alu alu;
5934 	int i, r;
5935 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5936 	int last_inst = tgsi_last_instruction(write_mask);
5937 
5938 	for (i = 0; i < 4; i++) {
5939 		if (!(write_mask & (1<<i)))
5940 			continue;
5941 
5942 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5943 		alu.op = ALU_OP1_TRUNC;
5944 
5945 		alu.dst.sel = ctx->temp_reg;
5946 		alu.dst.chan = i;
5947 		alu.dst.write = 1;
5948 
5949 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5950 		if (i == last_inst)
5951 			alu.last = 1;
5952 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5953 		if (r)
5954 			return r;
5955 	}
5956 
5957 	for (i = 0; i < 4; i++) {
5958 		if (!(write_mask & (1<<i)))
5959 			continue;
5960 
5961 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5962 		alu.op = ctx->inst_info->op;
5963 
5964 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5965 
5966 		alu.src[0].sel = ctx->temp_reg;
5967 		alu.src[0].chan = i;
5968 
5969 		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
5970 			alu.last = 1;
5971 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5972 		if (r)
5973 			return r;
5974 	}
5975 
5976 	return 0;
5977 }
5978 
tgsi_iabs(struct r600_shader_ctx * ctx)5979 static int tgsi_iabs(struct r600_shader_ctx *ctx)
5980 {
5981 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5982 	struct r600_bytecode_alu alu;
5983 	int i, r;
5984 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5985 	int last_inst = tgsi_last_instruction(write_mask);
5986 
5987 	/* tmp = -src */
5988 	for (i = 0; i < 4; i++) {
5989 		if (!(write_mask & (1<<i)))
5990 			continue;
5991 
5992 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5993 		alu.op = ALU_OP2_SUB_INT;
5994 
5995 		alu.dst.sel = ctx->temp_reg;
5996 		alu.dst.chan = i;
5997 		alu.dst.write = 1;
5998 
5999 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6000 		alu.src[0].sel = V_SQ_ALU_SRC_0;
6001 
6002 		if (i == last_inst)
6003 			alu.last = 1;
6004 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6005 		if (r)
6006 			return r;
6007 	}
6008 
6009 	/* dst = (src >= 0 ? src : tmp) */
6010 	for (i = 0; i < 4; i++) {
6011 		if (!(write_mask & (1<<i)))
6012 			continue;
6013 
6014 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6015 		alu.op = ALU_OP3_CNDGE_INT;
6016 		alu.is_op3 = 1;
6017 		alu.dst.write = 1;
6018 
6019 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6020 
6021 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6022 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6023 		alu.src[2].sel = ctx->temp_reg;
6024 		alu.src[2].chan = i;
6025 
6026 		if (i == last_inst)
6027 			alu.last = 1;
6028 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6029 		if (r)
6030 			return r;
6031 	}
6032 	return 0;
6033 }
6034 
tgsi_issg(struct r600_shader_ctx * ctx)6035 static int tgsi_issg(struct r600_shader_ctx *ctx)
6036 {
6037 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6038 	struct r600_bytecode_alu alu;
6039 	int i, r;
6040 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6041 	int last_inst = tgsi_last_instruction(write_mask);
6042 
6043 	/* tmp = (src >= 0 ? src : -1) */
6044 	for (i = 0; i < 4; i++) {
6045 		if (!(write_mask & (1<<i)))
6046 			continue;
6047 
6048 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6049 		alu.op = ALU_OP3_CNDGE_INT;
6050 		alu.is_op3 = 1;
6051 
6052 		alu.dst.sel = ctx->temp_reg;
6053 		alu.dst.chan = i;
6054 		alu.dst.write = 1;
6055 
6056 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6057 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6058 		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6059 
6060 		if (i == last_inst)
6061 			alu.last = 1;
6062 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6063 		if (r)
6064 			return r;
6065 	}
6066 
6067 	/* dst = (tmp > 0 ? 1 : tmp) */
6068 	for (i = 0; i < 4; i++) {
6069 		if (!(write_mask & (1<<i)))
6070 			continue;
6071 
6072 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6073 		alu.op = ALU_OP3_CNDGT_INT;
6074 		alu.is_op3 = 1;
6075 		alu.dst.write = 1;
6076 
6077 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6078 
6079 		alu.src[0].sel = ctx->temp_reg;
6080 		alu.src[0].chan = i;
6081 
6082 		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6083 
6084 		alu.src[2].sel = ctx->temp_reg;
6085 		alu.src[2].chan = i;
6086 
6087 		if (i == last_inst)
6088 			alu.last = 1;
6089 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6090 		if (r)
6091 			return r;
6092 	}
6093 	return 0;
6094 }
6095 
6096 
6097 
tgsi_ssg(struct r600_shader_ctx * ctx)6098 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6099 {
6100 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6101 	struct r600_bytecode_alu alu;
6102 	int i, r;
6103 
6104 	/* tmp = (src > 0 ? 1 : src) */
6105 	for (i = 0; i < 4; i++) {
6106 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6107 		alu.op = ALU_OP3_CNDGT;
6108 		alu.is_op3 = 1;
6109 
6110 		alu.dst.sel = ctx->temp_reg;
6111 		alu.dst.chan = i;
6112 
6113 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6114 		alu.src[1].sel = V_SQ_ALU_SRC_1;
6115 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6116 
6117 		if (i == 3)
6118 			alu.last = 1;
6119 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6120 		if (r)
6121 			return r;
6122 	}
6123 
6124 	/* dst = (-tmp > 0 ? -1 : tmp) */
6125 	for (i = 0; i < 4; i++) {
6126 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6127 		alu.op = ALU_OP3_CNDGT;
6128 		alu.is_op3 = 1;
6129 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6130 
6131 		alu.src[0].sel = ctx->temp_reg;
6132 		alu.src[0].chan = i;
6133 		alu.src[0].neg = 1;
6134 
6135 		alu.src[1].sel = V_SQ_ALU_SRC_1;
6136 		alu.src[1].neg = 1;
6137 
6138 		alu.src[2].sel = ctx->temp_reg;
6139 		alu.src[2].chan = i;
6140 
6141 		if (i == 3)
6142 			alu.last = 1;
6143 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6144 		if (r)
6145 			return r;
6146 	}
6147 	return 0;
6148 }
6149 
tgsi_bfi(struct r600_shader_ctx * ctx)6150 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6151 {
6152 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6153 	struct r600_bytecode_alu alu;
6154 	int i, r, t1, t2;
6155 
6156 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6157 	int last_inst = tgsi_last_instruction(write_mask);
6158 
6159 	t1 = ctx->temp_reg;
6160 
6161 	for (i = 0; i < 4; i++) {
6162 		if (!(write_mask & (1<<i)))
6163 			continue;
6164 
6165 		/* create mask tmp */
6166 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6167 		alu.op = ALU_OP2_BFM_INT;
6168 		alu.dst.sel = t1;
6169 		alu.dst.chan = i;
6170 		alu.dst.write = 1;
6171 		alu.last = i == last_inst;
6172 
6173 		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6174 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6175 
6176 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6177 		if (r)
6178 			return r;
6179 	}
6180 
6181 	t2 = r600_get_temp(ctx);
6182 
6183 	for (i = 0; i < 4; i++) {
6184 		if (!(write_mask & (1<<i)))
6185 			continue;
6186 
6187 		/* shift insert left */
6188 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6189 		alu.op = ALU_OP2_LSHL_INT;
6190 		alu.dst.sel = t2;
6191 		alu.dst.chan = i;
6192 		alu.dst.write = 1;
6193 		alu.last = i == last_inst;
6194 
6195 		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6196 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6197 
6198 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6199 		if (r)
6200 			return r;
6201 	}
6202 
6203 	for (i = 0; i < 4; i++) {
6204 		if (!(write_mask & (1<<i)))
6205 			continue;
6206 
6207 		/* actual bitfield insert */
6208 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6209 		alu.op = ALU_OP3_BFI_INT;
6210 		alu.is_op3 = 1;
6211 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6212 		alu.dst.chan = i;
6213 		alu.dst.write = 1;
6214 		alu.last = i == last_inst;
6215 
6216 		alu.src[0].sel = t1;
6217 		alu.src[0].chan = i;
6218 		alu.src[1].sel = t2;
6219 		alu.src[1].chan = i;
6220 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6221 
6222 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6223 		if (r)
6224 			return r;
6225 	}
6226 
6227 	return 0;
6228 }
6229 
tgsi_msb(struct r600_shader_ctx * ctx)6230 static int tgsi_msb(struct r600_shader_ctx *ctx)
6231 {
6232 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6233 	struct r600_bytecode_alu alu;
6234 	int i, r, t1, t2;
6235 
6236 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6237 	int last_inst = tgsi_last_instruction(write_mask);
6238 
6239 	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6240 		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6241 
6242 	t1 = ctx->temp_reg;
6243 
6244 	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6245 	for (i = 0; i < 4; i++) {
6246 		if (!(write_mask & (1<<i)))
6247 			continue;
6248 
6249 		/* t1 = FFBH_INT / FFBH_UINT */
6250 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6251 		alu.op = ctx->inst_info->op;
6252 		alu.dst.sel = t1;
6253 		alu.dst.chan = i;
6254 		alu.dst.write = 1;
6255 		alu.last = i == last_inst;
6256 
6257 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6258 
6259 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6260 		if (r)
6261 			return r;
6262 	}
6263 
6264 	t2 = r600_get_temp(ctx);
6265 
6266 	for (i = 0; i < 4; i++) {
6267 		if (!(write_mask & (1<<i)))
6268 			continue;
6269 
6270 		/* t2 = 31 - t1 */
6271 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6272 		alu.op = ALU_OP2_SUB_INT;
6273 		alu.dst.sel = t2;
6274 		alu.dst.chan = i;
6275 		alu.dst.write = 1;
6276 		alu.last = i == last_inst;
6277 
6278 		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6279 		alu.src[0].value = 31;
6280 		alu.src[1].sel = t1;
6281 		alu.src[1].chan = i;
6282 
6283 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6284 		if (r)
6285 			return r;
6286 	}
6287 
6288 	for (i = 0; i < 4; i++) {
6289 		if (!(write_mask & (1<<i)))
6290 			continue;
6291 
6292 		/* result = t1 >= 0 ? t2 : t1 */
6293 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6294 		alu.op = ALU_OP3_CNDGE_INT;
6295 		alu.is_op3 = 1;
6296 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6297 		alu.dst.chan = i;
6298 		alu.dst.write = 1;
6299 		alu.last = i == last_inst;
6300 
6301 		alu.src[0].sel = t1;
6302 		alu.src[0].chan = i;
6303 		alu.src[1].sel = t2;
6304 		alu.src[1].chan = i;
6305 		alu.src[2].sel = t1;
6306 		alu.src[2].chan = i;
6307 
6308 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6309 		if (r)
6310 			return r;
6311 	}
6312 
6313 	return 0;
6314 }
6315 
tgsi_interp_egcm(struct r600_shader_ctx * ctx)6316 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6317 {
6318 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6319 	struct r600_bytecode_alu alu;
6320 	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6321 	unsigned location;
6322 	int input;
6323 
6324 	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6325 
6326 	input = inst->Src[0].Register.Index;
6327 
6328 	/* Interpolators have been marked for use already by allocate_system_value_inputs */
6329 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6330 		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6331 		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6332 	}
6333 	else {
6334 		location = TGSI_INTERPOLATE_LOC_CENTROID;
6335 	}
6336 
6337 	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6338 	if (k < 0)
6339 		k = 0;
6340 	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6341 	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
6342 
6343 	/* NOTE: currently offset is not perspective correct */
6344 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6345 		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6346 		int sample_gpr = -1;
6347 		int gradientsH, gradientsV;
6348 		struct r600_bytecode_tex tex;
6349 
6350 		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6351 			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6352 		}
6353 
6354 		gradientsH = r600_get_temp(ctx);
6355 		gradientsV = r600_get_temp(ctx);
6356 		for (i = 0; i < 2; i++) {
6357 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6358 			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6359 			tex.src_gpr = interp_gpr;
6360 			tex.src_sel_x = interp_base_chan + 0;
6361 			tex.src_sel_y = interp_base_chan + 1;
6362 			tex.src_sel_z = 0;
6363 			tex.src_sel_w = 0;
6364 			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6365 			tex.dst_sel_x = 0;
6366 			tex.dst_sel_y = 1;
6367 			tex.dst_sel_z = 7;
6368 			tex.dst_sel_w = 7;
6369 			tex.inst_mod = 1; // Use per pixel gradient calculation
6370 			tex.sampler_id = 0;
6371 			tex.resource_id = tex.sampler_id;
6372 			r = r600_bytecode_add_tex(ctx->bc, &tex);
6373 			if (r)
6374 				return r;
6375 		}
6376 
6377 		for (i = 0; i < 2; i++) {
6378 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6379 			alu.op = ALU_OP3_MULADD;
6380 			alu.is_op3 = 1;
6381 			alu.src[0].sel = gradientsH;
6382 			alu.src[0].chan = i;
6383 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6384 				alu.src[1].sel = sample_gpr;
6385 				alu.src[1].chan = 2;
6386 			}
6387 			else {
6388 				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6389 			}
6390 			alu.src[2].sel = interp_gpr;
6391 			alu.src[2].chan = interp_base_chan + i;
6392 			alu.dst.sel = ctx->temp_reg;
6393 			alu.dst.chan = i;
6394 			alu.last = i == 1;
6395 
6396 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6397 			if (r)
6398 				return r;
6399 		}
6400 
6401 		for (i = 0; i < 2; i++) {
6402 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6403 			alu.op = ALU_OP3_MULADD;
6404 			alu.is_op3 = 1;
6405 			alu.src[0].sel = gradientsV;
6406 			alu.src[0].chan = i;
6407 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6408 				alu.src[1].sel = sample_gpr;
6409 				alu.src[1].chan = 3;
6410 			}
6411 			else {
6412 				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6413 			}
6414 			alu.src[2].sel = ctx->temp_reg;
6415 			alu.src[2].chan = i;
6416 			alu.dst.sel = ctx->temp_reg;
6417 			alu.dst.chan = i;
6418 			alu.last = i == 1;
6419 
6420 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6421 			if (r)
6422 				return r;
6423 		}
6424 	}
6425 
6426 	tmp = r600_get_temp(ctx);
6427 	for (i = 0; i < 8; i++) {
6428 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6429 		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6430 
6431 		alu.dst.sel = tmp;
6432 		if ((i > 1 && i < 6)) {
6433 			alu.dst.write = 1;
6434 		}
6435 		else {
6436 			alu.dst.write = 0;
6437 		}
6438 		alu.dst.chan = i % 4;
6439 
6440 		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6441 			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6442 			alu.src[0].sel = ctx->temp_reg;
6443 			alu.src[0].chan = 1 - (i % 2);
6444 		} else {
6445 			alu.src[0].sel = interp_gpr;
6446 			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
6447 		}
6448 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
6449 		alu.src[1].chan = 0;
6450 
6451 		alu.last = i % 4 == 3;
6452 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
6453 
6454 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6455 		if (r)
6456 			return r;
6457 	}
6458 
6459 	// INTERP can't swizzle dst
6460 	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6461 	for (i = 0; i <= lasti; i++) {
6462 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6463 			continue;
6464 
6465 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6466 		alu.op = ALU_OP1_MOV;
6467 		alu.src[0].sel = tmp;
6468 		alu.src[0].chan = ctx->src[0].swizzle[i];
6469 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6470 		alu.dst.write = 1;
6471 		alu.last = i == lasti;
6472 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6473 		if (r)
6474 			return r;
6475 	}
6476 
6477 	return 0;
6478 }
6479 
6480 
tgsi_helper_copy(struct r600_shader_ctx * ctx,struct tgsi_full_instruction * inst)6481 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6482 {
6483 	struct r600_bytecode_alu alu;
6484 	int i, r;
6485 
6486 	for (i = 0; i < 4; i++) {
6487 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6488 		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6489 			alu.op = ALU_OP0_NOP;
6490 			alu.dst.chan = i;
6491 		} else {
6492 			alu.op = ALU_OP1_MOV;
6493 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6494 			alu.src[0].sel = ctx->temp_reg;
6495 			alu.src[0].chan = i;
6496 		}
6497 		if (i == 3) {
6498 			alu.last = 1;
6499 		}
6500 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6501 		if (r)
6502 			return r;
6503 	}
6504 	return 0;
6505 }
6506 
tgsi_make_src_for_op3(struct r600_shader_ctx * ctx,unsigned temp,int chan,struct r600_bytecode_alu_src * bc_src,const struct r600_shader_src * shader_src)6507 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6508                                  unsigned temp, int chan,
6509                                  struct r600_bytecode_alu_src *bc_src,
6510                                  const struct r600_shader_src *shader_src)
6511 {
6512 	struct r600_bytecode_alu alu;
6513 	int r;
6514 
6515 	r600_bytecode_src(bc_src, shader_src, chan);
6516 
6517 	/* op3 operands don't support abs modifier */
6518 	if (bc_src->abs) {
6519 		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
6520 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6521 		alu.op = ALU_OP1_MOV;
6522 		alu.dst.sel = temp;
6523 		alu.dst.chan = chan;
6524 		alu.dst.write = 1;
6525 
6526 		alu.src[0] = *bc_src;
6527 		alu.last = true; // sufficient?
6528 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6529 		if (r)
6530 			return r;
6531 
6532 		memset(bc_src, 0, sizeof(*bc_src));
6533 		bc_src->sel = temp;
6534 		bc_src->chan = chan;
6535 	}
6536 	return 0;
6537 }
6538 
tgsi_op3(struct r600_shader_ctx * ctx)6539 static int tgsi_op3(struct r600_shader_ctx *ctx)
6540 {
6541 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6542 	struct r600_bytecode_alu alu;
6543 	int i, j, r;
6544 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6545 	int temp_regs[4];
6546 
6547 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6548 		temp_regs[j] = 0;
6549 		if (ctx->src[j].abs)
6550 			temp_regs[j] = r600_get_temp(ctx);
6551 	}
6552 	for (i = 0; i < lasti + 1; i++) {
6553 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6554 			continue;
6555 
6556 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6557 		alu.op = ctx->inst_info->op;
6558 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6559 			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6560 			if (r)
6561 				return r;
6562 		}
6563 
6564 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6565 		alu.dst.chan = i;
6566 		alu.dst.write = 1;
6567 		alu.is_op3 = 1;
6568 		if (i == lasti) {
6569 			alu.last = 1;
6570 		}
6571 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6572 		if (r)
6573 			return r;
6574 	}
6575 	return 0;
6576 }
6577 
tgsi_dp(struct r600_shader_ctx * ctx)6578 static int tgsi_dp(struct r600_shader_ctx *ctx)
6579 {
6580 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6581 	struct r600_bytecode_alu alu;
6582 	int i, j, r;
6583 
6584 	for (i = 0; i < 4; i++) {
6585 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6586 		alu.op = ctx->inst_info->op;
6587 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6588 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6589 		}
6590 
6591 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6592 		alu.dst.chan = i;
6593 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6594 		/* handle some special cases */
6595 		switch (inst->Instruction.Opcode) {
6596 		case TGSI_OPCODE_DP2:
6597 			if (i > 1) {
6598 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6599 				alu.src[0].chan = alu.src[1].chan = 0;
6600 			}
6601 			break;
6602 		case TGSI_OPCODE_DP3:
6603 			if (i > 2) {
6604 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6605 				alu.src[0].chan = alu.src[1].chan = 0;
6606 			}
6607 			break;
6608 		case TGSI_OPCODE_DPH:
6609 			if (i == 3) {
6610 				alu.src[0].sel = V_SQ_ALU_SRC_1;
6611 				alu.src[0].chan = 0;
6612 				alu.src[0].neg = 0;
6613 			}
6614 			break;
6615 		default:
6616 			break;
6617 		}
6618 		if (i == 3) {
6619 			alu.last = 1;
6620 		}
6621 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6622 		if (r)
6623 			return r;
6624 	}
6625 	return 0;
6626 }
6627 
tgsi_tex_src_requires_loading(struct r600_shader_ctx * ctx,unsigned index)6628 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6629 						    unsigned index)
6630 {
6631 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6632 	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6633 		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6634 		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6635 		ctx->src[index].neg || ctx->src[index].abs ||
6636 		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6637 }
6638 
tgsi_tex_get_src_gpr(struct r600_shader_ctx * ctx,unsigned index)6639 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6640 					unsigned index)
6641 {
6642 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6643 	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6644 }
6645 
/*
 * Emit a buffer fetch through the vertex-fetch path (tgsi_tex() routes
 * TXF on TGSI_TEXTURE_BUFFER here).
 *
 * @src_requires_loading: when set, src0 is first copied into
 *                        ctx->temp_reg and fetched from there.
 *
 * src1 selects the buffer.  On chips older than EVERGREEN the fetched
 * value is post-processed with per-buffer constants read from
 * R600_BUFFER_INFO_CONST_BUFFER: every written channel is ANDed with a
 * mask and .w additionally gets a constant ORed in (presumably the
 * format's channel mask and its default alpha -- confirm against the
 * code that fills that constant buffer).
 *
 * Returns 0 on success, otherwise the error code reported by the
 * bytecode emitters.
 */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const unsigned write_mask = inst->Dst[0].Register.WriteMask;
	/* loop invariant, so computed once instead of on every iteration */
	const int lasti = tgsi_last_instruction(write_mask);
	int src_gpr, r, i;
	int id = tgsi_tex_get_src_gpr(ctx, 1);

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		/* the fetch can only read a plain register: copy src0 over */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* channels outside the write mask get selector 7 */
	vtx.dst_sel_x = (write_mask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (write_mask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (write_mask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (write_mask & 8) ? 3 : 7;		/* SEL_W */
	vtx.use_const_fields = 1;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* no fix-up is applied from EVERGREEN on */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* dst.chan &= per-buffer mask, for every written channel */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		/* two constant vectors per buffer; the masks are in the first */
		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/*
	 * dst.w |= constant from the second vector.
	 *
	 * This touches channel 3 only, so it has to be gated on the w bit
	 * of the write mask.  The previous test (WriteMask & 3) looked at
	 * x/y instead: it modified dst.w for masks such as .xy, where w must
	 * stay untouched, and skipped the fix-up for masks such as .w or .zw.
	 */
	if (write_mask & (1 << 3)) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6740 
r600_do_buffer_txq(struct r600_shader_ctx * ctx)6741 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
6742 {
6743 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6744 	struct r600_bytecode_alu alu;
6745 	int r;
6746 	int id = tgsi_tex_get_src_gpr(ctx, 1);
6747 
6748 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6749 	alu.op = ALU_OP1_MOV;
6750 	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6751 	if (ctx->bc->chip_class >= EVERGREEN) {
6752 		/* channel 0 or 2 of each word */
6753 		alu.src[0].sel += (id / 2);
6754 		alu.src[0].chan = (id % 2) * 2;
6755 	} else {
6756 		/* r600 we have them at channel 2 of the second dword */
6757 		alu.src[0].sel += (id * 2) + 1;
6758 		alu.src[0].chan = 1;
6759 	}
6760 	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6761 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6762 	alu.last = 1;
6763 	r = r600_bytecode_add_alu(ctx->bc, &alu);
6764 	if (r)
6765 		return r;
6766 	return 0;
6767 }
6768 
tgsi_tex(struct r600_shader_ctx * ctx)6769 static int tgsi_tex(struct r600_shader_ctx *ctx)
6770 {
6771 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6772 	struct r600_bytecode_tex tex;
6773 	struct r600_bytecode_alu alu;
6774 	unsigned src_gpr;
6775 	int r, i, j;
6776 	int opcode;
6777 	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6778 				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6779 				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6780 				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6781 
6782 	bool txf_add_offsets = inst->Texture.NumOffsets &&
6783 			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6784 			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6785 
6786 	/* Texture fetch instructions can only use gprs as source.
6787 	 * Also they cannot negate the source or take the absolute value */
6788 	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
6789 					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6790                                               tgsi_tex_src_requires_loading(ctx, 0)) ||
6791 					     read_compressed_msaa || txf_add_offsets;
6792 
6793 	boolean src_loaded = FALSE;
6794 	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
6795 	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6796 	boolean has_txq_cube_array_z = false;
6797 	unsigned sampler_index_mode;
6798 
6799 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6800 	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6801 	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6802 		if (inst->Dst[0].Register.WriteMask & 4) {
6803 			ctx->shader->has_txq_cube_array_z_comp = true;
6804 			has_txq_cube_array_z = true;
6805 		}
6806 
6807 	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6808 	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6809 	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6810 	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6811 		sampler_src_reg = 2;
6812 
6813 	/* TGSI moves the sampler to src reg 3 for TXD */
6814 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6815 		sampler_src_reg = 3;
6816 
6817 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6818 
6819 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6820 
6821 	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6822 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6823 			ctx->shader->uses_tex_buffers = true;
6824 			return r600_do_buffer_txq(ctx);
6825 		}
6826 		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6827 			if (ctx->bc->chip_class < EVERGREEN)
6828 				ctx->shader->uses_tex_buffers = true;
6829 			return do_vtx_fetch_inst(ctx, src_requires_loading);
6830 		}
6831 	}
6832 
6833 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6834 		int out_chan;
6835 		/* Add perspective divide */
6836 		if (ctx->bc->chip_class == CAYMAN) {
6837 			out_chan = 2;
6838 			for (i = 0; i < 3; i++) {
6839 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6840 				alu.op = ALU_OP1_RECIP_IEEE;
6841 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6842 
6843 				alu.dst.sel = ctx->temp_reg;
6844 				alu.dst.chan = i;
6845 				if (i == 2)
6846 					alu.last = 1;
6847 				if (out_chan == i)
6848 					alu.dst.write = 1;
6849 				r = r600_bytecode_add_alu(ctx->bc, &alu);
6850 				if (r)
6851 					return r;
6852 			}
6853 
6854 		} else {
6855 			out_chan = 3;
6856 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6857 			alu.op = ALU_OP1_RECIP_IEEE;
6858 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6859 
6860 			alu.dst.sel = ctx->temp_reg;
6861 			alu.dst.chan = out_chan;
6862 			alu.last = 1;
6863 			alu.dst.write = 1;
6864 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6865 			if (r)
6866 				return r;
6867 		}
6868 
6869 		for (i = 0; i < 3; i++) {
6870 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6871 			alu.op = ALU_OP2_MUL;
6872 			alu.src[0].sel = ctx->temp_reg;
6873 			alu.src[0].chan = out_chan;
6874 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6875 			alu.dst.sel = ctx->temp_reg;
6876 			alu.dst.chan = i;
6877 			alu.dst.write = 1;
6878 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6879 			if (r)
6880 				return r;
6881 		}
6882 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6883 		alu.op = ALU_OP1_MOV;
6884 		alu.src[0].sel = V_SQ_ALU_SRC_1;
6885 		alu.src[0].chan = 0;
6886 		alu.dst.sel = ctx->temp_reg;
6887 		alu.dst.chan = 3;
6888 		alu.last = 1;
6889 		alu.dst.write = 1;
6890 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6891 		if (r)
6892 			return r;
6893 		src_loaded = TRUE;
6894 		src_gpr = ctx->temp_reg;
6895 	}
6896 
6897 
6898 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6899 	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6900 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6901 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6902 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
6903 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
6904 
6905 		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
6906 		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
6907 
6908 		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
6909 		for (i = 0; i < 4; i++) {
6910 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6911 			alu.op = ALU_OP2_CUBE;
6912 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6913 			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
6914 			alu.dst.sel = ctx->temp_reg;
6915 			alu.dst.chan = i;
6916 			if (i == 3)
6917 				alu.last = 1;
6918 			alu.dst.write = 1;
6919 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6920 			if (r)
6921 				return r;
6922 		}
6923 
6924 		/* tmp1.z = RCP_e(|tmp1.z|) */
6925 		if (ctx->bc->chip_class == CAYMAN) {
6926 			for (i = 0; i < 3; i++) {
6927 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6928 				alu.op = ALU_OP1_RECIP_IEEE;
6929 				alu.src[0].sel = ctx->temp_reg;
6930 				alu.src[0].chan = 2;
6931 				alu.src[0].abs = 1;
6932 				alu.dst.sel = ctx->temp_reg;
6933 				alu.dst.chan = i;
6934 				if (i == 2)
6935 					alu.dst.write = 1;
6936 				if (i == 2)
6937 					alu.last = 1;
6938 				r = r600_bytecode_add_alu(ctx->bc, &alu);
6939 				if (r)
6940 					return r;
6941 			}
6942 		} else {
6943 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6944 			alu.op = ALU_OP1_RECIP_IEEE;
6945 			alu.src[0].sel = ctx->temp_reg;
6946 			alu.src[0].chan = 2;
6947 			alu.src[0].abs = 1;
6948 			alu.dst.sel = ctx->temp_reg;
6949 			alu.dst.chan = 2;
6950 			alu.dst.write = 1;
6951 			alu.last = 1;
6952 			r = r600_bytecode_add_alu(ctx->bc, &alu);
6953 			if (r)
6954 				return r;
6955 		}
6956 
6957 		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
6958 		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
6959 		 * muladd has no writemask, have to use another temp
6960 		 */
6961 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6962 		alu.op = ALU_OP3_MULADD;
6963 		alu.is_op3 = 1;
6964 
6965 		alu.src[0].sel = ctx->temp_reg;
6966 		alu.src[0].chan = 0;
6967 		alu.src[1].sel = ctx->temp_reg;
6968 		alu.src[1].chan = 2;
6969 
6970 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6971 		alu.src[2].chan = 0;
6972 		alu.src[2].value = u_bitcast_f2u(1.5f);
6973 
6974 		alu.dst.sel = ctx->temp_reg;
6975 		alu.dst.chan = 0;
6976 		alu.dst.write = 1;
6977 
6978 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6979 		if (r)
6980 			return r;
6981 
6982 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6983 		alu.op = ALU_OP3_MULADD;
6984 		alu.is_op3 = 1;
6985 
6986 		alu.src[0].sel = ctx->temp_reg;
6987 		alu.src[0].chan = 1;
6988 		alu.src[1].sel = ctx->temp_reg;
6989 		alu.src[1].chan = 2;
6990 
6991 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6992 		alu.src[2].chan = 0;
6993 		alu.src[2].value = u_bitcast_f2u(1.5f);
6994 
6995 		alu.dst.sel = ctx->temp_reg;
6996 		alu.dst.chan = 1;
6997 		alu.dst.write = 1;
6998 
6999 		alu.last = 1;
7000 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7001 		if (r)
7002 			return r;
7003 		/* write initial compare value into Z component
7004 		  - W src 0 for shadow cube
7005 		  - X src 1 for shadow cube array */
7006 		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7007 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7008 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7009 			alu.op = ALU_OP1_MOV;
7010 			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7011 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7012 			else
7013 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7014 			alu.dst.sel = ctx->temp_reg;
7015 			alu.dst.chan = 2;
7016 			alu.dst.write = 1;
7017 			alu.last = 1;
7018 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7019 			if (r)
7020 				return r;
7021 		}
7022 
7023 		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7024 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7025 			if (ctx->bc->chip_class >= EVERGREEN) {
7026 				int mytmp = r600_get_temp(ctx);
7027 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7028 				alu.op = ALU_OP1_MOV;
7029 				alu.src[0].sel = ctx->temp_reg;
7030 				alu.src[0].chan = 3;
7031 				alu.dst.sel = mytmp;
7032 				alu.dst.chan = 0;
7033 				alu.dst.write = 1;
7034 				alu.last = 1;
7035 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7036 				if (r)
7037 					return r;
7038 
7039 				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7040 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7041 				alu.op = ALU_OP3_MULADD;
7042 				alu.is_op3 = 1;
7043 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7044 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7045 				alu.src[1].chan = 0;
7046 				alu.src[1].value = u_bitcast_f2u(8.0f);
7047 				alu.src[2].sel = mytmp;
7048 				alu.src[2].chan = 0;
7049 				alu.dst.sel = ctx->temp_reg;
7050 				alu.dst.chan = 3;
7051 				alu.dst.write = 1;
7052 				alu.last = 1;
7053 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7054 				if (r)
7055 					return r;
7056 			} else if (ctx->bc->chip_class < EVERGREEN) {
7057 				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7058 				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7059 				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7060 				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7061 				tex.src_gpr = r600_get_temp(ctx);
7062 				tex.src_sel_x = 0;
7063 				tex.src_sel_y = 0;
7064 				tex.src_sel_z = 0;
7065 				tex.src_sel_w = 0;
7066 				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7067 				tex.coord_type_x = 1;
7068 				tex.coord_type_y = 1;
7069 				tex.coord_type_z = 1;
7070 				tex.coord_type_w = 1;
7071 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7072 				alu.op = ALU_OP1_MOV;
7073 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7074 				alu.dst.sel = tex.src_gpr;
7075 				alu.dst.chan = 0;
7076 				alu.last = 1;
7077 				alu.dst.write = 1;
7078 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7079 				if (r)
7080 					return r;
7081 
7082 				r = r600_bytecode_add_tex(ctx->bc, &tex);
7083 				if (r)
7084 					return r;
7085 			}
7086 
7087 		}
7088 
7089 		/* for cube forms of lod and bias we need to route things */
7090 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7091 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7092 		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7093 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7094 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7095 			alu.op = ALU_OP1_MOV;
7096 			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7097 			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7098 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7099 			else
7100 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7101 			alu.dst.sel = ctx->temp_reg;
7102 			alu.dst.chan = 2;
7103 			alu.last = 1;
7104 			alu.dst.write = 1;
7105 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7106 			if (r)
7107 				return r;
7108 		}
7109 
7110 		src_loaded = TRUE;
7111 		src_gpr = ctx->temp_reg;
7112 	}
7113 
7114 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7115 		int temp_h = 0, temp_v = 0;
7116 		int start_val = 0;
7117 
7118 		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7119 		if (src_loaded == TRUE)
7120 			start_val = 1;
7121 		else
7122 			src_loaded = TRUE;
7123 		for (i = start_val; i < 3; i++) {
7124 			int treg = r600_get_temp(ctx);
7125 
7126 			if (i == 0)
7127 				src_gpr = treg;
7128 			else if (i == 1)
7129 				temp_h = treg;
7130 			else
7131 				temp_v = treg;
7132 
7133 			for (j = 0; j < 4; j++) {
7134 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7135 				alu.op = ALU_OP1_MOV;
7136                                 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7137                                 alu.dst.sel = treg;
7138                                 alu.dst.chan = j;
7139                                 if (j == 3)
7140                                    alu.last = 1;
7141                                 alu.dst.write = 1;
7142                                 r = r600_bytecode_add_alu(ctx->bc, &alu);
7143                                 if (r)
7144                                     return r;
7145 			}
7146 		}
7147 		for (i = 1; i < 3; i++) {
7148 			/* set gradients h/v */
7149 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7150 			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7151 				FETCH_OP_SET_GRADIENTS_V;
7152 			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7153 			tex.sampler_index_mode = sampler_index_mode;
7154 			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7155 			tex.resource_index_mode = sampler_index_mode;
7156 
7157 			tex.src_gpr = (i == 1) ? temp_h : temp_v;
7158 			tex.src_sel_x = 0;
7159 			tex.src_sel_y = 1;
7160 			tex.src_sel_z = 2;
7161 			tex.src_sel_w = 3;
7162 
7163 			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7164 			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7165 			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7166 				tex.coord_type_x = 1;
7167 				tex.coord_type_y = 1;
7168 				tex.coord_type_z = 1;
7169 				tex.coord_type_w = 1;
7170 			}
7171 			r = r600_bytecode_add_tex(ctx->bc, &tex);
7172 			if (r)
7173 				return r;
7174 		}
7175 	}
7176 
7177 	if (src_requires_loading && !src_loaded) {
7178 		for (i = 0; i < 4; i++) {
7179 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7180 			alu.op = ALU_OP1_MOV;
7181 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7182 			alu.dst.sel = ctx->temp_reg;
7183 			alu.dst.chan = i;
7184 			if (i == 3)
7185 				alu.last = 1;
7186 			alu.dst.write = 1;
7187 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7188 			if (r)
7189 				return r;
7190 		}
7191 		src_loaded = TRUE;
7192 		src_gpr = ctx->temp_reg;
7193 	}
7194 
7195 	/* get offset values */
7196 	if (inst->Texture.NumOffsets) {
7197 		assert(inst->Texture.NumOffsets == 1);
7198 
7199 		/* The texture offset feature doesn't work with the TXF instruction
7200 		 * and must be emulated by adding the offset to the texture coordinates. */
7201 		if (txf_add_offsets) {
7202 			const struct tgsi_texture_offset *off = inst->TexOffsets;
7203 
7204 			switch (inst->Texture.Texture) {
7205 			case TGSI_TEXTURE_3D:
7206 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7207 				alu.op = ALU_OP2_ADD_INT;
7208 				alu.src[0].sel = src_gpr;
7209 				alu.src[0].chan = 2;
7210 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7211 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7212 				alu.dst.sel = src_gpr;
7213 				alu.dst.chan = 2;
7214 				alu.dst.write = 1;
7215 				alu.last = 1;
7216 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7217 				if (r)
7218 					return r;
7219 				/* fall through */
7220 
7221 			case TGSI_TEXTURE_2D:
7222 			case TGSI_TEXTURE_SHADOW2D:
7223 			case TGSI_TEXTURE_RECT:
7224 			case TGSI_TEXTURE_SHADOWRECT:
7225 			case TGSI_TEXTURE_2D_ARRAY:
7226 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7227 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7228 				alu.op = ALU_OP2_ADD_INT;
7229 				alu.src[0].sel = src_gpr;
7230 				alu.src[0].chan = 1;
7231 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7232 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7233 				alu.dst.sel = src_gpr;
7234 				alu.dst.chan = 1;
7235 				alu.dst.write = 1;
7236 				alu.last = 1;
7237 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7238 				if (r)
7239 					return r;
7240 				/* fall through */
7241 
7242 			case TGSI_TEXTURE_1D:
7243 			case TGSI_TEXTURE_SHADOW1D:
7244 			case TGSI_TEXTURE_1D_ARRAY:
7245 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7246 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7247 				alu.op = ALU_OP2_ADD_INT;
7248 				alu.src[0].sel = src_gpr;
7249 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7250 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7251 				alu.dst.sel = src_gpr;
7252 				alu.dst.write = 1;
7253 				alu.last = 1;
7254 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7255 				if (r)
7256 					return r;
7257 				break;
7258 				/* texture offsets do not apply to other texture targets */
7259 			}
7260 		} else {
7261 			switch (inst->Texture.Texture) {
7262 			case TGSI_TEXTURE_3D:
7263 				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7264 				/* fallthrough */
7265 			case TGSI_TEXTURE_2D:
7266 			case TGSI_TEXTURE_SHADOW2D:
7267 			case TGSI_TEXTURE_RECT:
7268 			case TGSI_TEXTURE_SHADOWRECT:
7269 			case TGSI_TEXTURE_2D_ARRAY:
7270 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7271 				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7272 				/* fallthrough */
7273 			case TGSI_TEXTURE_1D:
7274 			case TGSI_TEXTURE_SHADOW1D:
7275 			case TGSI_TEXTURE_1D_ARRAY:
7276 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7277 				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7278 			}
7279 		}
7280 	}
7281 
7282 	/* Obtain the sample index for reading a compressed MSAA color texture.
7283 	 * To read the FMASK, we use the ldfptr instruction, which tells us
7284 	 * where the samples are stored.
7285 	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7286 	 * which is the identity mapping. Each nibble says which physical sample
7287 	 * should be fetched to get that sample.
7288 	 *
7289 	 * Assume src.z contains the sample index. It should be modified like this:
7290 	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7291 	 * Then fetch the texel with src.
7292 	 */
7293 	if (read_compressed_msaa) {
7294 		unsigned sample_chan = 3;
7295 		unsigned temp = r600_get_temp(ctx);
7296 		assert(src_loaded);
7297 
7298 		/* temp.w = ldfptr() */
7299 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7300 		tex.op = FETCH_OP_LD;
7301 		tex.inst_mod = 1; /* to indicate this is ldfptr */
7302 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7303 		tex.sampler_index_mode = sampler_index_mode;
7304 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7305 		tex.resource_index_mode = sampler_index_mode;
7306 		tex.src_gpr = src_gpr;
7307 		tex.dst_gpr = temp;
7308 		tex.dst_sel_x = 7; /* mask out these components */
7309 		tex.dst_sel_y = 7;
7310 		tex.dst_sel_z = 7;
7311 		tex.dst_sel_w = 0; /* store X */
7312 		tex.src_sel_x = 0;
7313 		tex.src_sel_y = 1;
7314 		tex.src_sel_z = 2;
7315 		tex.src_sel_w = 3;
7316 		tex.offset_x = offset_x;
7317 		tex.offset_y = offset_y;
7318 		tex.offset_z = offset_z;
7319 		r = r600_bytecode_add_tex(ctx->bc, &tex);
7320 		if (r)
7321 			return r;
7322 
7323 		/* temp.x = sample_index*4 */
7324 		if (ctx->bc->chip_class == CAYMAN) {
7325 			for (i = 0 ; i < 4; i++) {
7326 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7327 				alu.op = ALU_OP2_MULLO_INT;
7328 				alu.src[0].sel = src_gpr;
7329 				alu.src[0].chan = sample_chan;
7330 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7331 				alu.src[1].value = 4;
7332 				alu.dst.sel = temp;
7333 				alu.dst.chan = i;
7334 				alu.dst.write = i == 0;
7335 				if (i == 3)
7336 					alu.last = 1;
7337 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7338 				if (r)
7339 					return r;
7340 			}
7341 		} else {
7342 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7343 			alu.op = ALU_OP2_MULLO_INT;
7344 			alu.src[0].sel = src_gpr;
7345 			alu.src[0].chan = sample_chan;
7346 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7347 			alu.src[1].value = 4;
7348 			alu.dst.sel = temp;
7349 			alu.dst.chan = 0;
7350 			alu.dst.write = 1;
7351 			alu.last = 1;
7352 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7353 			if (r)
7354 				return r;
7355 		}
7356 
7357 		/* sample_index = temp.w >> temp.x */
7358 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7359 		alu.op = ALU_OP2_LSHR_INT;
7360 		alu.src[0].sel = temp;
7361 		alu.src[0].chan = 3;
7362 		alu.src[1].sel = temp;
7363 		alu.src[1].chan = 0;
7364 		alu.dst.sel = src_gpr;
7365 		alu.dst.chan = sample_chan;
7366 		alu.dst.write = 1;
7367 		alu.last = 1;
7368 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7369 		if (r)
7370 			return r;
7371 
7372 		/* sample_index & 0xF */
7373 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7374 		alu.op = ALU_OP2_AND_INT;
7375 		alu.src[0].sel = src_gpr;
7376 		alu.src[0].chan = sample_chan;
7377 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7378 		alu.src[1].value = 0xF;
7379 		alu.dst.sel = src_gpr;
7380 		alu.dst.chan = sample_chan;
7381 		alu.dst.write = 1;
7382 		alu.last = 1;
7383 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7384 		if (r)
7385 			return r;
7386 #if 0
7387 		/* visualize the FMASK */
7388 		for (i = 0; i < 4; i++) {
7389 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7390 			alu.op = ALU_OP1_INT_TO_FLT;
7391 			alu.src[0].sel = src_gpr;
7392 			alu.src[0].chan = sample_chan;
7393 			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7394 			alu.dst.chan = i;
7395 			alu.dst.write = 1;
7396 			alu.last = 1;
7397 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7398 			if (r)
7399 				return r;
7400 		}
7401 		return 0;
7402 #endif
7403 	}
7404 
7405 	/* does this shader want a num layers from TXQ for a cube array? */
7406 	if (has_txq_cube_array_z) {
7407 		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7408 
7409 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7410 		alu.op = ALU_OP1_MOV;
7411 
7412 		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7413 		if (ctx->bc->chip_class >= EVERGREEN) {
7414 			/* channel 1 or 3 of each word */
7415 			alu.src[0].sel += (id / 2);
7416 			alu.src[0].chan = ((id % 2) * 2) + 1;
7417 		} else {
7418 			/* r600 we have them at channel 2 of the second dword */
7419 			alu.src[0].sel += (id * 2) + 1;
7420 			alu.src[0].chan = 2;
7421 		}
7422 		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7423 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7424 		alu.last = 1;
7425 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7426 		if (r)
7427 			return r;
7428 		/* disable writemask from texture instruction */
7429 		inst->Dst[0].Register.WriteMask &= ~4;
7430 	}
7431 
7432 	opcode = ctx->inst_info->op;
7433 	if (opcode == FETCH_OP_GATHER4 &&
7434 		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7435 		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7436 		opcode = FETCH_OP_GATHER4_O;
7437 
7438 		/* GATHER4_O/GATHER4_C_O use offset values loaded by
7439 		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
7440 		   encoded in the instruction are ignored. */
7441 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7442 		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7443 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7444 		tex.sampler_index_mode = sampler_index_mode;
7445 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7446 		tex.resource_index_mode = sampler_index_mode;
7447 
7448 		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7449 		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7450 		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7451 		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7452 		tex.src_sel_w = 4;
7453 
7454 		tex.dst_sel_x = 7;
7455 		tex.dst_sel_y = 7;
7456 		tex.dst_sel_z = 7;
7457 		tex.dst_sel_w = 7;
7458 
7459 		r = r600_bytecode_add_tex(ctx->bc, &tex);
7460 		if (r)
7461 			return r;
7462 	}
7463 
7464 	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7465 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7466 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7467 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7468 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7469 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7470 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7471 		switch (opcode) {
7472 		case FETCH_OP_SAMPLE:
7473 			opcode = FETCH_OP_SAMPLE_C;
7474 			break;
7475 		case FETCH_OP_SAMPLE_L:
7476 			opcode = FETCH_OP_SAMPLE_C_L;
7477 			break;
7478 		case FETCH_OP_SAMPLE_LB:
7479 			opcode = FETCH_OP_SAMPLE_C_LB;
7480 			break;
7481 		case FETCH_OP_SAMPLE_G:
7482 			opcode = FETCH_OP_SAMPLE_C_G;
7483 			break;
7484 		/* Texture gather variants */
7485 		case FETCH_OP_GATHER4:
7486 			opcode = FETCH_OP_GATHER4_C;
7487 			break;
7488 		case FETCH_OP_GATHER4_O:
7489 			opcode = FETCH_OP_GATHER4_C_O;
7490 			break;
7491 		}
7492 	}
7493 
7494 	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7495 	tex.op = opcode;
7496 
7497 	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7498 	tex.sampler_index_mode = sampler_index_mode;
7499 	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7500 	tex.resource_index_mode = sampler_index_mode;
7501 	tex.src_gpr = src_gpr;
7502 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7503 
7504 	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7505 		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7506 		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7507 	}
7508 
7509 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7510 		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7511 		tex.inst_mod = texture_component_select;
7512 
7513 		if (ctx->bc->chip_class == CAYMAN) {
7514 		/* GATHER4 result order is different from TGSI TG4 */
7515 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7516 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7517 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7518 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7519 		} else {
7520 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7521 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7522 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7523 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7524 		}
7525 	}
7526 	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7527 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7528 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7529 		tex.dst_sel_z = 7;
7530 		tex.dst_sel_w = 7;
7531 	}
7532 	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7533 		tex.dst_sel_x = 3;
7534 		tex.dst_sel_y = 7;
7535 		tex.dst_sel_z = 7;
7536 		tex.dst_sel_w = 7;
7537 	}
7538 	else {
7539 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7540 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7541 		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7542 		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7543 	}
7544 
7545 
7546 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
7547 	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7548 		tex.src_sel_x = 4;
7549 		tex.src_sel_y = 4;
7550 		tex.src_sel_z = 4;
7551 		tex.src_sel_w = 4;
7552 	} else if (src_loaded) {
7553 		tex.src_sel_x = 0;
7554 		tex.src_sel_y = 1;
7555 		tex.src_sel_z = 2;
7556 		tex.src_sel_w = 3;
7557 	} else {
7558 		tex.src_sel_x = ctx->src[0].swizzle[0];
7559 		tex.src_sel_y = ctx->src[0].swizzle[1];
7560 		tex.src_sel_z = ctx->src[0].swizzle[2];
7561 		tex.src_sel_w = ctx->src[0].swizzle[3];
7562 		tex.src_rel = ctx->src[0].rel;
7563 	}
7564 
7565 	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7566 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7567 	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7568 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7569 		tex.src_sel_x = 1;
7570 		tex.src_sel_y = 0;
7571 		tex.src_sel_z = 3;
7572 		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7573 	}
7574 
7575 	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7576 	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7577 		tex.coord_type_x = 1;
7578 		tex.coord_type_y = 1;
7579 	}
7580 	tex.coord_type_z = 1;
7581 	tex.coord_type_w = 1;
7582 
7583 	tex.offset_x = offset_x;
7584 	tex.offset_y = offset_y;
7585 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7586 		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7587 		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7588 		tex.offset_z = 0;
7589 	}
7590 	else {
7591 		tex.offset_z = offset_z;
7592 	}
7593 
7594 	/* Put the depth for comparison in W.
7595 	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7596 	 * Some instructions expect the depth in Z. */
7597 	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7598 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7599 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7600 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7601 	    opcode != FETCH_OP_SAMPLE_C_L &&
7602 	    opcode != FETCH_OP_SAMPLE_C_LB) {
7603 		tex.src_sel_w = tex.src_sel_z;
7604 	}
7605 
7606 	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7607 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7608 		if (opcode == FETCH_OP_SAMPLE_C_L ||
7609 		    opcode == FETCH_OP_SAMPLE_C_LB) {
7610 			/* the array index is read from Y */
7611 			tex.coord_type_y = 0;
7612 		} else {
7613 			/* the array index is read from Z */
7614 			tex.coord_type_z = 0;
7615 			tex.src_sel_z = tex.src_sel_y;
7616 		}
7617 	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7618 		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7619 		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7620 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7621 		    (ctx->bc->chip_class >= EVERGREEN)))
7622 		/* the array index is read from Z */
7623 		tex.coord_type_z = 0;
7624 
7625 	/* mask unused source components */
7626 	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7627 		switch (inst->Texture.Texture) {
7628 		case TGSI_TEXTURE_2D:
7629 		case TGSI_TEXTURE_RECT:
7630 			tex.src_sel_z = 7;
7631 			tex.src_sel_w = 7;
7632 			break;
7633 		case TGSI_TEXTURE_1D_ARRAY:
7634 			tex.src_sel_y = 7;
7635 			tex.src_sel_w = 7;
7636 			break;
7637 		case TGSI_TEXTURE_1D:
7638 			tex.src_sel_y = 7;
7639 			tex.src_sel_z = 7;
7640 			tex.src_sel_w = 7;
7641 			break;
7642 		}
7643 	}
7644 
7645 	r = r600_bytecode_add_tex(ctx->bc, &tex);
7646 	if (r)
7647 		return r;
7648 
7649 	/* add shadow ambient support  - gallium doesn't do it yet */
7650 	return 0;
7651 }
7652 
tgsi_lrp(struct r600_shader_ctx * ctx)7653 static int tgsi_lrp(struct r600_shader_ctx *ctx)
7654 {
7655 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7656 	struct r600_bytecode_alu alu;
7657 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7658 	unsigned i, temp_regs[2];
7659 	int r;
7660 
7661 	/* optimize if it's just an equal balance */
7662 	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
7663 		for (i = 0; i < lasti + 1; i++) {
7664 			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7665 				continue;
7666 
7667 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7668 			alu.op = ALU_OP2_ADD;
7669 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
7670 			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7671 			alu.omod = 3;
7672 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7673 			alu.dst.chan = i;
7674 			if (i == lasti) {
7675 				alu.last = 1;
7676 			}
7677 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7678 			if (r)
7679 				return r;
7680 		}
7681 		return 0;
7682 	}
7683 
7684 	/* 1 - src0 */
7685 	for (i = 0; i < lasti + 1; i++) {
7686 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7687 			continue;
7688 
7689 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7690 		alu.op = ALU_OP2_ADD;
7691 		alu.src[0].sel = V_SQ_ALU_SRC_1;
7692 		alu.src[0].chan = 0;
7693 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7694 		r600_bytecode_src_toggle_neg(&alu.src[1]);
7695 		alu.dst.sel = ctx->temp_reg;
7696 		alu.dst.chan = i;
7697 		if (i == lasti) {
7698 			alu.last = 1;
7699 		}
7700 		alu.dst.write = 1;
7701 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7702 		if (r)
7703 			return r;
7704 	}
7705 
7706 	/* (1 - src0) * src2 */
7707 	for (i = 0; i < lasti + 1; i++) {
7708 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7709 			continue;
7710 
7711 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7712 		alu.op = ALU_OP2_MUL;
7713 		alu.src[0].sel = ctx->temp_reg;
7714 		alu.src[0].chan = i;
7715 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7716 		alu.dst.sel = ctx->temp_reg;
7717 		alu.dst.chan = i;
7718 		if (i == lasti) {
7719 			alu.last = 1;
7720 		}
7721 		alu.dst.write = 1;
7722 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7723 		if (r)
7724 			return r;
7725 	}
7726 
7727 	/* src0 * src1 + (1 - src0) * src2 */
7728         if (ctx->src[0].abs)
7729 		temp_regs[0] = r600_get_temp(ctx);
7730 	else
7731 		temp_regs[0] = 0;
7732 	if (ctx->src[1].abs)
7733 		temp_regs[1] = r600_get_temp(ctx);
7734 	else
7735 		temp_regs[1] = 0;
7736 
7737 	for (i = 0; i < lasti + 1; i++) {
7738 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7739 			continue;
7740 
7741 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7742 		alu.op = ALU_OP3_MULADD;
7743 		alu.is_op3 = 1;
7744 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
7745 		if (r)
7746 			return r;
7747 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
7748 		if (r)
7749 			return r;
7750 		alu.src[2].sel = ctx->temp_reg;
7751 		alu.src[2].chan = i;
7752 
7753 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7754 		alu.dst.chan = i;
7755 		if (i == lasti) {
7756 			alu.last = 1;
7757 		}
7758 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7759 		if (r)
7760 			return r;
7761 	}
7762 	return 0;
7763 }
7764 
tgsi_cmp(struct r600_shader_ctx * ctx)7765 static int tgsi_cmp(struct r600_shader_ctx *ctx)
7766 {
7767 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7768 	struct r600_bytecode_alu alu;
7769 	int i, r, j;
7770 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7771 	int temp_regs[3];
7772 	unsigned op;
7773 
7774 	if (ctx->src[0].abs && ctx->src[0].neg) {
7775 		op = ALU_OP3_CNDE;
7776 		ctx->src[0].abs = 0;
7777 		ctx->src[0].neg = 0;
7778 	} else {
7779 		op = ALU_OP3_CNDGE;
7780 	}
7781 
7782 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7783 		temp_regs[j] = 0;
7784 		if (ctx->src[j].abs)
7785 			temp_regs[j] = r600_get_temp(ctx);
7786 	}
7787 
7788 	for (i = 0; i < lasti + 1; i++) {
7789 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7790 			continue;
7791 
7792 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7793 		alu.op = op;
7794 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
7795 		if (r)
7796 			return r;
7797 		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
7798 		if (r)
7799 			return r;
7800 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
7801 		if (r)
7802 			return r;
7803 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7804 		alu.dst.chan = i;
7805 		alu.dst.write = 1;
7806 		alu.is_op3 = 1;
7807 		if (i == lasti)
7808 			alu.last = 1;
7809 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7810 		if (r)
7811 			return r;
7812 	}
7813 	return 0;
7814 }
7815 
tgsi_ucmp(struct r600_shader_ctx * ctx)7816 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
7817 {
7818 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7819 	struct r600_bytecode_alu alu;
7820 	int i, r;
7821 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7822 
7823 	for (i = 0; i < lasti + 1; i++) {
7824 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7825 			continue;
7826 
7827 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7828 		alu.op = ALU_OP3_CNDE_INT;
7829 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7830 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7831 		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
7832 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7833 		alu.dst.chan = i;
7834 		alu.dst.write = 1;
7835 		alu.is_op3 = 1;
7836 		if (i == lasti)
7837 			alu.last = 1;
7838 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7839 		if (r)
7840 			return r;
7841 	}
7842 	return 0;
7843 }
7844 
tgsi_xpd(struct r600_shader_ctx * ctx)7845 static int tgsi_xpd(struct r600_shader_ctx *ctx)
7846 {
7847 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7848 	static const unsigned int src0_swizzle[] = {2, 0, 1};
7849 	static const unsigned int src1_swizzle[] = {1, 2, 0};
7850 	struct r600_bytecode_alu alu;
7851 	uint32_t use_temp = 0;
7852 	int i, r;
7853 
7854 	if (inst->Dst[0].Register.WriteMask != 0xf)
7855 		use_temp = 1;
7856 
7857 	for (i = 0; i < 4; i++) {
7858 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7859 		alu.op = ALU_OP2_MUL;
7860 		if (i < 3) {
7861 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7862 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
7863 		} else {
7864 			alu.src[0].sel = V_SQ_ALU_SRC_0;
7865 			alu.src[0].chan = i;
7866 			alu.src[1].sel = V_SQ_ALU_SRC_0;
7867 			alu.src[1].chan = i;
7868 		}
7869 
7870 		alu.dst.sel = ctx->temp_reg;
7871 		alu.dst.chan = i;
7872 		alu.dst.write = 1;
7873 
7874 		if (i == 3)
7875 			alu.last = 1;
7876 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7877 		if (r)
7878 			return r;
7879 	}
7880 
7881 	for (i = 0; i < 4; i++) {
7882 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7883 		alu.op = ALU_OP3_MULADD;
7884 
7885 		if (i < 3) {
7886 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
7887 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
7888 		} else {
7889 			alu.src[0].sel = V_SQ_ALU_SRC_0;
7890 			alu.src[0].chan = i;
7891 			alu.src[1].sel = V_SQ_ALU_SRC_0;
7892 			alu.src[1].chan = i;
7893 		}
7894 
7895 		alu.src[2].sel = ctx->temp_reg;
7896 		alu.src[2].neg = 1;
7897 		alu.src[2].chan = i;
7898 
7899 		if (use_temp)
7900 			alu.dst.sel = ctx->temp_reg;
7901 		else
7902 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7903 		alu.dst.chan = i;
7904 		alu.dst.write = 1;
7905 		alu.is_op3 = 1;
7906 		if (i == 3)
7907 			alu.last = 1;
7908 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7909 		if (r)
7910 			return r;
7911 	}
7912 	if (use_temp)
7913 		return tgsi_helper_copy(ctx, inst);
7914 	return 0;
7915 }
7916 
tgsi_exp(struct r600_shader_ctx * ctx)7917 static int tgsi_exp(struct r600_shader_ctx *ctx)
7918 {
7919 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7920 	struct r600_bytecode_alu alu;
7921 	int r;
7922 	unsigned i;
7923 
7924 	/* result.x = 2^floor(src); */
7925 	if (inst->Dst[0].Register.WriteMask & 1) {
7926 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7927 
7928 		alu.op = ALU_OP1_FLOOR;
7929 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7930 
7931 		alu.dst.sel = ctx->temp_reg;
7932 		alu.dst.chan = 0;
7933 		alu.dst.write = 1;
7934 		alu.last = 1;
7935 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7936 		if (r)
7937 			return r;
7938 
7939 		if (ctx->bc->chip_class == CAYMAN) {
7940 			for (i = 0; i < 3; i++) {
7941 				alu.op = ALU_OP1_EXP_IEEE;
7942 				alu.src[0].sel = ctx->temp_reg;
7943 				alu.src[0].chan = 0;
7944 
7945 				alu.dst.sel = ctx->temp_reg;
7946 				alu.dst.chan = i;
7947 				alu.dst.write = i == 0;
7948 				alu.last = i == 2;
7949 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7950 				if (r)
7951 					return r;
7952 			}
7953 		} else {
7954 			alu.op = ALU_OP1_EXP_IEEE;
7955 			alu.src[0].sel = ctx->temp_reg;
7956 			alu.src[0].chan = 0;
7957 
7958 			alu.dst.sel = ctx->temp_reg;
7959 			alu.dst.chan = 0;
7960 			alu.dst.write = 1;
7961 			alu.last = 1;
7962 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7963 			if (r)
7964 				return r;
7965 		}
7966 	}
7967 
7968 	/* result.y = tmp - floor(tmp); */
7969 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7970 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7971 
7972 		alu.op = ALU_OP1_FRACT;
7973 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7974 
7975 		alu.dst.sel = ctx->temp_reg;
7976 #if 0
7977 		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7978 		if (r)
7979 			return r;
7980 #endif
7981 		alu.dst.write = 1;
7982 		alu.dst.chan = 1;
7983 
7984 		alu.last = 1;
7985 
7986 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7987 		if (r)
7988 			return r;
7989 	}
7990 
7991 	/* result.z = RoughApprox2ToX(tmp);*/
7992 	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
7993 		if (ctx->bc->chip_class == CAYMAN) {
7994 			for (i = 0; i < 3; i++) {
7995 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7996 				alu.op = ALU_OP1_EXP_IEEE;
7997 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7998 
7999 				alu.dst.sel = ctx->temp_reg;
8000 				alu.dst.chan = i;
8001 				if (i == 2) {
8002 					alu.dst.write = 1;
8003 					alu.last = 1;
8004 				}
8005 
8006 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8007 				if (r)
8008 					return r;
8009 			}
8010 		} else {
8011 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8012 			alu.op = ALU_OP1_EXP_IEEE;
8013 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8014 
8015 			alu.dst.sel = ctx->temp_reg;
8016 			alu.dst.write = 1;
8017 			alu.dst.chan = 2;
8018 
8019 			alu.last = 1;
8020 
8021 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8022 			if (r)
8023 				return r;
8024 		}
8025 	}
8026 
8027 	/* result.w = 1.0;*/
8028 	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
8029 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8030 
8031 		alu.op = ALU_OP1_MOV;
8032 		alu.src[0].sel = V_SQ_ALU_SRC_1;
8033 		alu.src[0].chan = 0;
8034 
8035 		alu.dst.sel = ctx->temp_reg;
8036 		alu.dst.chan = 3;
8037 		alu.dst.write = 1;
8038 		alu.last = 1;
8039 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8040 		if (r)
8041 			return r;
8042 	}
8043 	return tgsi_helper_copy(ctx, inst);
8044 }
8045 
tgsi_log(struct r600_shader_ctx * ctx)8046 static int tgsi_log(struct r600_shader_ctx *ctx)
8047 {
8048 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8049 	struct r600_bytecode_alu alu;
8050 	int r;
8051 	unsigned i;
8052 
8053 	/* result.x = floor(log2(|src|)); */
8054 	if (inst->Dst[0].Register.WriteMask & 1) {
8055 		if (ctx->bc->chip_class == CAYMAN) {
8056 			for (i = 0; i < 3; i++) {
8057 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8058 
8059 				alu.op = ALU_OP1_LOG_IEEE;
8060 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8061 				r600_bytecode_src_set_abs(&alu.src[0]);
8062 
8063 				alu.dst.sel = ctx->temp_reg;
8064 				alu.dst.chan = i;
8065 				if (i == 0)
8066 					alu.dst.write = 1;
8067 				if (i == 2)
8068 					alu.last = 1;
8069 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8070 				if (r)
8071 					return r;
8072 			}
8073 
8074 		} else {
8075 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8076 
8077 			alu.op = ALU_OP1_LOG_IEEE;
8078 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8079 			r600_bytecode_src_set_abs(&alu.src[0]);
8080 
8081 			alu.dst.sel = ctx->temp_reg;
8082 			alu.dst.chan = 0;
8083 			alu.dst.write = 1;
8084 			alu.last = 1;
8085 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8086 			if (r)
8087 				return r;
8088 		}
8089 
8090 		alu.op = ALU_OP1_FLOOR;
8091 		alu.src[0].sel = ctx->temp_reg;
8092 		alu.src[0].chan = 0;
8093 
8094 		alu.dst.sel = ctx->temp_reg;
8095 		alu.dst.chan = 0;
8096 		alu.dst.write = 1;
8097 		alu.last = 1;
8098 
8099 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8100 		if (r)
8101 			return r;
8102 	}
8103 
8104 	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
8105 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
8106 
8107 		if (ctx->bc->chip_class == CAYMAN) {
8108 			for (i = 0; i < 3; i++) {
8109 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8110 
8111 				alu.op = ALU_OP1_LOG_IEEE;
8112 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8113 				r600_bytecode_src_set_abs(&alu.src[0]);
8114 
8115 				alu.dst.sel = ctx->temp_reg;
8116 				alu.dst.chan = i;
8117 				if (i == 1)
8118 					alu.dst.write = 1;
8119 				if (i == 2)
8120 					alu.last = 1;
8121 
8122 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8123 				if (r)
8124 					return r;
8125 			}
8126 		} else {
8127 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8128 
8129 			alu.op = ALU_OP1_LOG_IEEE;
8130 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8131 			r600_bytecode_src_set_abs(&alu.src[0]);
8132 
8133 			alu.dst.sel = ctx->temp_reg;
8134 			alu.dst.chan = 1;
8135 			alu.dst.write = 1;
8136 			alu.last = 1;
8137 
8138 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8139 			if (r)
8140 				return r;
8141 		}
8142 
8143 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8144 
8145 		alu.op = ALU_OP1_FLOOR;
8146 		alu.src[0].sel = ctx->temp_reg;
8147 		alu.src[0].chan = 1;
8148 
8149 		alu.dst.sel = ctx->temp_reg;
8150 		alu.dst.chan = 1;
8151 		alu.dst.write = 1;
8152 		alu.last = 1;
8153 
8154 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8155 		if (r)
8156 			return r;
8157 
8158 		if (ctx->bc->chip_class == CAYMAN) {
8159 			for (i = 0; i < 3; i++) {
8160 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8161 				alu.op = ALU_OP1_EXP_IEEE;
8162 				alu.src[0].sel = ctx->temp_reg;
8163 				alu.src[0].chan = 1;
8164 
8165 				alu.dst.sel = ctx->temp_reg;
8166 				alu.dst.chan = i;
8167 				if (i == 1)
8168 					alu.dst.write = 1;
8169 				if (i == 2)
8170 					alu.last = 1;
8171 
8172 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8173 				if (r)
8174 					return r;
8175 			}
8176 		} else {
8177 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8178 			alu.op = ALU_OP1_EXP_IEEE;
8179 			alu.src[0].sel = ctx->temp_reg;
8180 			alu.src[0].chan = 1;
8181 
8182 			alu.dst.sel = ctx->temp_reg;
8183 			alu.dst.chan = 1;
8184 			alu.dst.write = 1;
8185 			alu.last = 1;
8186 
8187 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8188 			if (r)
8189 				return r;
8190 		}
8191 
8192 		if (ctx->bc->chip_class == CAYMAN) {
8193 			for (i = 0; i < 3; i++) {
8194 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8195 				alu.op = ALU_OP1_RECIP_IEEE;
8196 				alu.src[0].sel = ctx->temp_reg;
8197 				alu.src[0].chan = 1;
8198 
8199 				alu.dst.sel = ctx->temp_reg;
8200 				alu.dst.chan = i;
8201 				if (i == 1)
8202 					alu.dst.write = 1;
8203 				if (i == 2)
8204 					alu.last = 1;
8205 
8206 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8207 				if (r)
8208 					return r;
8209 			}
8210 		} else {
8211 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8212 			alu.op = ALU_OP1_RECIP_IEEE;
8213 			alu.src[0].sel = ctx->temp_reg;
8214 			alu.src[0].chan = 1;
8215 
8216 			alu.dst.sel = ctx->temp_reg;
8217 			alu.dst.chan = 1;
8218 			alu.dst.write = 1;
8219 			alu.last = 1;
8220 
8221 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8222 			if (r)
8223 				return r;
8224 		}
8225 
8226 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8227 
8228 		alu.op = ALU_OP2_MUL;
8229 
8230 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8231 		r600_bytecode_src_set_abs(&alu.src[0]);
8232 
8233 		alu.src[1].sel = ctx->temp_reg;
8234 		alu.src[1].chan = 1;
8235 
8236 		alu.dst.sel = ctx->temp_reg;
8237 		alu.dst.chan = 1;
8238 		alu.dst.write = 1;
8239 		alu.last = 1;
8240 
8241 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8242 		if (r)
8243 			return r;
8244 	}
8245 
8246 	/* result.z = log2(|src|);*/
8247 	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
8248 		if (ctx->bc->chip_class == CAYMAN) {
8249 			for (i = 0; i < 3; i++) {
8250 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8251 
8252 				alu.op = ALU_OP1_LOG_IEEE;
8253 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8254 				r600_bytecode_src_set_abs(&alu.src[0]);
8255 
8256 				alu.dst.sel = ctx->temp_reg;
8257 				if (i == 2)
8258 					alu.dst.write = 1;
8259 				alu.dst.chan = i;
8260 				if (i == 2)
8261 					alu.last = 1;
8262 
8263 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8264 				if (r)
8265 					return r;
8266 			}
8267 		} else {
8268 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8269 
8270 			alu.op = ALU_OP1_LOG_IEEE;
8271 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8272 			r600_bytecode_src_set_abs(&alu.src[0]);
8273 
8274 			alu.dst.sel = ctx->temp_reg;
8275 			alu.dst.write = 1;
8276 			alu.dst.chan = 2;
8277 			alu.last = 1;
8278 
8279 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8280 			if (r)
8281 				return r;
8282 		}
8283 	}
8284 
8285 	/* result.w = 1.0; */
8286 	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
8287 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8288 
8289 		alu.op = ALU_OP1_MOV;
8290 		alu.src[0].sel = V_SQ_ALU_SRC_1;
8291 		alu.src[0].chan = 0;
8292 
8293 		alu.dst.sel = ctx->temp_reg;
8294 		alu.dst.chan = 3;
8295 		alu.dst.write = 1;
8296 		alu.last = 1;
8297 
8298 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8299 		if (r)
8300 			return r;
8301 	}
8302 
8303 	return tgsi_helper_copy(ctx, inst);
8304 }
8305 
tgsi_eg_arl(struct r600_shader_ctx * ctx)8306 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
8307 {
8308 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8309 	struct r600_bytecode_alu alu;
8310 	int r;
8311 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8312 	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
8313 
8314 	assert(inst->Dst[0].Register.Index < 3);
8315 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8316 
8317 	switch (inst->Instruction.Opcode) {
8318 	case TGSI_OPCODE_ARL:
8319 		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
8320 		break;
8321 	case TGSI_OPCODE_ARR:
8322 		alu.op = ALU_OP1_FLT_TO_INT;
8323 		break;
8324 	case TGSI_OPCODE_UARL:
8325 		alu.op = ALU_OP1_MOV;
8326 		break;
8327 	default:
8328 		assert(0);
8329 		return -1;
8330 	}
8331 
8332 	for (i = 0; i <= lasti; ++i) {
8333 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8334 			continue;
8335 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8336 		alu.last = i == lasti;
8337 		alu.dst.sel = reg;
8338 	        alu.dst.chan = i;
8339 		alu.dst.write = 1;
8340 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8341 		if (r)
8342 			return r;
8343 	}
8344 
8345 	if (inst->Dst[0].Register.Index > 0)
8346 		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
8347 	else
8348 		ctx->bc->ar_loaded = 0;
8349 
8350 	return 0;
8351 }
tgsi_r600_arl(struct r600_shader_ctx * ctx)8352 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
8353 {
8354 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8355 	struct r600_bytecode_alu alu;
8356 	int r;
8357 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8358 
8359 	switch (inst->Instruction.Opcode) {
8360 	case TGSI_OPCODE_ARL:
8361 		memset(&alu, 0, sizeof(alu));
8362 		alu.op = ALU_OP1_FLOOR;
8363 		alu.dst.sel = ctx->bc->ar_reg;
8364 		alu.dst.write = 1;
8365 		for (i = 0; i <= lasti; ++i) {
8366 			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
8367 				alu.dst.chan = i;
8368 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8369 				alu.last = i == lasti;
8370 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
8371 					return r;
8372 			}
8373 		}
8374 
8375 		memset(&alu, 0, sizeof(alu));
8376 		alu.op = ALU_OP1_FLT_TO_INT;
8377 		alu.src[0].sel = ctx->bc->ar_reg;
8378 		alu.dst.sel = ctx->bc->ar_reg;
8379 		alu.dst.write = 1;
8380 		/* FLT_TO_INT is trans-only on r600/r700 */
8381 		alu.last = TRUE;
8382 		for (i = 0; i <= lasti; ++i) {
8383 			alu.dst.chan = i;
8384 			alu.src[0].chan = i;
8385 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
8386 				return r;
8387 		}
8388 		break;
8389 	case TGSI_OPCODE_ARR:
8390 		memset(&alu, 0, sizeof(alu));
8391 		alu.op = ALU_OP1_FLT_TO_INT;
8392 		alu.dst.sel = ctx->bc->ar_reg;
8393 		alu.dst.write = 1;
8394 		/* FLT_TO_INT is trans-only on r600/r700 */
8395 		alu.last = TRUE;
8396 		for (i = 0; i <= lasti; ++i) {
8397 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
8398 				alu.dst.chan = i;
8399 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8400 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
8401 					return r;
8402 			}
8403 		}
8404 		break;
8405 	case TGSI_OPCODE_UARL:
8406 		memset(&alu, 0, sizeof(alu));
8407 		alu.op = ALU_OP1_MOV;
8408 		alu.dst.sel = ctx->bc->ar_reg;
8409 		alu.dst.write = 1;
8410 		for (i = 0; i <= lasti; ++i) {
8411 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
8412 				alu.dst.chan = i;
8413 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8414 				alu.last = i == lasti;
8415 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
8416 					return r;
8417 			}
8418 		}
8419 		break;
8420 	default:
8421 		assert(0);
8422 		return -1;
8423 	}
8424 
8425 	ctx->bc->ar_loaded = 0;
8426 	return 0;
8427 }
8428 
tgsi_opdst(struct r600_shader_ctx * ctx)8429 static int tgsi_opdst(struct r600_shader_ctx *ctx)
8430 {
8431 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8432 	struct r600_bytecode_alu alu;
8433 	int i, r = 0;
8434 
8435 	for (i = 0; i < 4; i++) {
8436 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8437 
8438 		alu.op = ALU_OP2_MUL;
8439 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8440 
8441 		if (i == 0 || i == 3) {
8442 			alu.src[0].sel = V_SQ_ALU_SRC_1;
8443 		} else {
8444 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8445 		}
8446 
8447 		if (i == 0 || i == 2) {
8448 			alu.src[1].sel = V_SQ_ALU_SRC_1;
8449 		} else {
8450 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8451 		}
8452 		if (i == 3)
8453 			alu.last = 1;
8454 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8455 		if (r)
8456 			return r;
8457 	}
8458 	return 0;
8459 }
8460 
emit_logic_pred(struct r600_shader_ctx * ctx,int opcode,int alu_type)8461 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
8462 {
8463 	struct r600_bytecode_alu alu;
8464 	int r;
8465 
8466 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8467 	alu.op = opcode;
8468 	alu.execute_mask = 1;
8469 	alu.update_pred = 1;
8470 
8471 	alu.dst.sel = ctx->temp_reg;
8472 	alu.dst.write = 1;
8473 	alu.dst.chan = 0;
8474 
8475 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8476 	alu.src[1].sel = V_SQ_ALU_SRC_0;
8477 	alu.src[1].chan = 0;
8478 
8479 	alu.last = 1;
8480 
8481 	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
8482 	if (r)
8483 		return r;
8484 	return 0;
8485 }
8486 
pops(struct r600_shader_ctx * ctx,int pops)8487 static int pops(struct r600_shader_ctx *ctx, int pops)
8488 {
8489 	unsigned force_pop = ctx->bc->force_add_cf;
8490 
8491 	if (!force_pop) {
8492 		int alu_pop = 3;
8493 		if (ctx->bc->cf_last) {
8494 			if (ctx->bc->cf_last->op == CF_OP_ALU)
8495 				alu_pop = 0;
8496 			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
8497 				alu_pop = 1;
8498 		}
8499 		alu_pop += pops;
8500 		if (alu_pop == 1) {
8501 			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
8502 			ctx->bc->force_add_cf = 1;
8503 		} else if (alu_pop == 2) {
8504 			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
8505 			ctx->bc->force_add_cf = 1;
8506 		} else {
8507 			force_pop = 1;
8508 		}
8509 	}
8510 
8511 	if (force_pop) {
8512 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
8513 		ctx->bc->cf_last->pop_count = pops;
8514 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
8515 	}
8516 
8517 	return 0;
8518 }
8519 
callstack_update_max_depth(struct r600_shader_ctx * ctx,unsigned reason)8520 static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
8521                                               unsigned reason)
8522 {
8523 	struct r600_stack_info *stack = &ctx->bc->stack;
8524 	unsigned elements, entries;
8525 
8526 	unsigned entry_size = stack->entry_size;
8527 
8528 	elements = (stack->loop + stack->push_wqm ) * entry_size;
8529 	elements += stack->push;
8530 
8531 	switch (ctx->bc->chip_class) {
8532 	case R600:
8533 	case R700:
8534 		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
8535 		 * the stack must be reserved to hold the current active/continue
8536 		 * masks */
8537 		if (reason == FC_PUSH_VPM) {
8538 			elements += 2;
8539 		}
8540 		break;
8541 
8542 	case CAYMAN:
8543 		/* r9xx: any stack operation on empty stack consumes 2 additional
8544 		 * elements */
8545 		elements += 2;
8546 
8547 		/* fallthrough */
8548 		/* FIXME: do the two elements added above cover the cases for the
8549 		 * r8xx+ below? */
8550 
8551 	case EVERGREEN:
8552 		/* r8xx+: 2 extra elements are not always required, but one extra
8553 		 * element must be added for each of the following cases:
8554 		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
8555 		 *    stack usage.
8556 		 *    (Currently we don't use ALU_ELSE_AFTER.)
8557 		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
8558 		 *    PUSH instruction executed.
8559 		 *
8560 		 *    NOTE: it seems we also need to reserve additional element in some
8561 		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
8562 		 *    then STACK_SIZE should be 2 instead of 1 */
8563 		if (reason == FC_PUSH_VPM) {
8564 			elements += 1;
8565 		}
8566 		break;
8567 
8568 	default:
8569 		assert(0);
8570 		break;
8571 	}
8572 
8573 	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
8574 	 * for all chips, so we use 4 in the final formula, not the real entry_size
8575 	 * for the chip */
8576 	entry_size = 4;
8577 
8578 	entries = (elements + (entry_size - 1)) / entry_size;
8579 
8580 	if (entries > stack->max_entries)
8581 		stack->max_entries = entries;
8582 }
8583 
callstack_pop(struct r600_shader_ctx * ctx,unsigned reason)8584 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
8585 {
8586 	switch(reason) {
8587 	case FC_PUSH_VPM:
8588 		--ctx->bc->stack.push;
8589 		assert(ctx->bc->stack.push >= 0);
8590 		break;
8591 	case FC_PUSH_WQM:
8592 		--ctx->bc->stack.push_wqm;
8593 		assert(ctx->bc->stack.push_wqm >= 0);
8594 		break;
8595 	case FC_LOOP:
8596 		--ctx->bc->stack.loop;
8597 		assert(ctx->bc->stack.loop >= 0);
8598 		break;
8599 	default:
8600 		assert(0);
8601 		break;
8602 	}
8603 }
8604 
callstack_push(struct r600_shader_ctx * ctx,unsigned reason)8605 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
8606 {
8607 	switch (reason) {
8608 	case FC_PUSH_VPM:
8609 		++ctx->bc->stack.push;
8610 		break;
8611 	case FC_PUSH_WQM:
8612 		++ctx->bc->stack.push_wqm;
8613 	case FC_LOOP:
8614 		++ctx->bc->stack.loop;
8615 		break;
8616 	default:
8617 		assert(0);
8618 	}
8619 
8620 	callstack_update_max_depth(ctx, reason);
8621 }
8622 
fc_set_mid(struct r600_shader_ctx * ctx,int fc_sp)8623 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
8624 {
8625 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
8626 
8627 	sp->mid = realloc((void *)sp->mid,
8628 						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
8629 	sp->mid[sp->num_mid] = ctx->bc->cf_last;
8630 	sp->num_mid++;
8631 }
8632 
fc_pushlevel(struct r600_shader_ctx * ctx,int type)8633 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
8634 {
8635 	ctx->bc->fc_sp++;
8636 	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
8637 	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
8638 }
8639 
fc_poplevel(struct r600_shader_ctx * ctx)8640 static void fc_poplevel(struct r600_shader_ctx *ctx)
8641 {
8642 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
8643 	free(sp->mid);
8644 	sp->mid = NULL;
8645 	sp->num_mid = 0;
8646 	sp->start = NULL;
8647 	sp->type = 0;
8648 	ctx->bc->fc_sp--;
8649 }
8650 
#if 0
/*
 * Disabled flow-control helpers for return-from-loop handling.  They are not
 * compiled; several of them are empty placeholders.
 */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
8698 
emit_if(struct r600_shader_ctx * ctx,int opcode)8699 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
8700 {
8701 	int alu_type = CF_OP_ALU_PUSH_BEFORE;
8702 
8703 	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
8704 	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
8705 	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
8706 	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
8707 	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
8708 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
8709 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
8710 		alu_type = CF_OP_ALU;
8711 	}
8712 
8713 	emit_logic_pred(ctx, opcode, alu_type);
8714 
8715 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
8716 
8717 	fc_pushlevel(ctx, FC_IF);
8718 
8719 	callstack_push(ctx, FC_PUSH_VPM);
8720 	return 0;
8721 }
8722 
tgsi_if(struct r600_shader_ctx * ctx)8723 static int tgsi_if(struct r600_shader_ctx *ctx)
8724 {
8725 	return emit_if(ctx, ALU_OP2_PRED_SETNE);
8726 }
8727 
tgsi_uif(struct r600_shader_ctx * ctx)8728 static int tgsi_uif(struct r600_shader_ctx *ctx)
8729 {
8730 	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
8731 }
8732 
tgsi_else(struct r600_shader_ctx * ctx)8733 static int tgsi_else(struct r600_shader_ctx *ctx)
8734 {
8735 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
8736 	ctx->bc->cf_last->pop_count = 1;
8737 
8738 	fc_set_mid(ctx, ctx->bc->fc_sp);
8739 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
8740 	return 0;
8741 }
8742 
tgsi_endif(struct r600_shader_ctx * ctx)8743 static int tgsi_endif(struct r600_shader_ctx *ctx)
8744 {
8745 	pops(ctx, 1);
8746 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
8747 		R600_ERR("if/endif unbalanced in shader\n");
8748 		return -1;
8749 	}
8750 
8751 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
8752 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8753 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
8754 	} else {
8755 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
8756 	}
8757 	fc_poplevel(ctx);
8758 
8759 	callstack_pop(ctx, FC_PUSH_VPM);
8760 	return 0;
8761 }
8762 
tgsi_bgnloop(struct r600_shader_ctx * ctx)8763 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
8764 {
8765 	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
8766 	 * limited to 4096 iterations, like the other LOOP_* instructions. */
8767 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
8768 
8769 	fc_pushlevel(ctx, FC_LOOP);
8770 
8771 	/* check stack depth */
8772 	callstack_push(ctx, FC_LOOP);
8773 	return 0;
8774 }
8775 
tgsi_endloop(struct r600_shader_ctx * ctx)8776 static int tgsi_endloop(struct r600_shader_ctx *ctx)
8777 {
8778 	unsigned i;
8779 
8780 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
8781 
8782 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
8783 		R600_ERR("loop/endloop in shader code are not paired.\n");
8784 		return -EINVAL;
8785 	}
8786 
8787 	/* fixup loop pointers - from r600isa
8788 	   LOOP END points to CF after LOOP START,
8789 	   LOOP START point to CF after LOOP END
8790 	   BRK/CONT point to LOOP END CF
8791 	*/
8792 	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
8793 
8794 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8795 
8796 	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
8797 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
8798 	}
8799 	/* XXX add LOOPRET support */
8800 	fc_poplevel(ctx);
8801 	callstack_pop(ctx, FC_LOOP);
8802 	return 0;
8803 }
8804 
tgsi_loop_breakc(struct r600_shader_ctx * ctx)8805 static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
8806 {
8807 	int r;
8808 	unsigned int fscp;
8809 
8810 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
8811 	{
8812 		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
8813 			break;
8814 	}
8815 	if (fscp == 0) {
8816 		R600_ERR("BREAKC not inside loop/endloop pair\n");
8817 		return -EINVAL;
8818 	}
8819 
8820 	if (ctx->bc->chip_class == EVERGREEN &&
8821 	    ctx->bc->family != CHIP_CYPRESS &&
8822 	    ctx->bc->family != CHIP_JUNIPER) {
8823 		/* HW bug: ALU_BREAK does not save the active mask correctly */
8824 		r = tgsi_uif(ctx);
8825 		if (r)
8826 			return r;
8827 
8828 		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
8829 		if (r)
8830 			return r;
8831 		fc_set_mid(ctx, fscp);
8832 
8833 		return tgsi_endif(ctx);
8834 	} else {
8835 		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
8836 		if (r)
8837 			return r;
8838 		fc_set_mid(ctx, fscp);
8839 	}
8840 
8841 	return 0;
8842 }
8843 
tgsi_loop_brk_cont(struct r600_shader_ctx * ctx)8844 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
8845 {
8846 	unsigned int fscp;
8847 
8848 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
8849 	{
8850 		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
8851 			break;
8852 	}
8853 
8854 	if (fscp == 0) {
8855 		R600_ERR("Break not inside loop/endloop pair\n");
8856 		return -EINVAL;
8857 	}
8858 
8859 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8860 
8861 	fc_set_mid(ctx, fscp);
8862 
8863 	return 0;
8864 }
8865 
tgsi_gs_emit(struct r600_shader_ctx * ctx)8866 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
8867 {
8868 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8869 	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
8870 	int r;
8871 
8872 	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
8873 		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
8874 
8875 	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8876 	if (!r) {
8877 		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
8878 		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
8879 			return emit_inc_ring_offset(ctx, stream, TRUE);
8880 	}
8881 	return r;
8882 }
8883 
tgsi_umad(struct r600_shader_ctx * ctx)8884 static int tgsi_umad(struct r600_shader_ctx *ctx)
8885 {
8886 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8887 	struct r600_bytecode_alu alu;
8888 	int i, j, k, r;
8889 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8890 
8891 	/* src0 * src1 */
8892 	for (i = 0; i < lasti + 1; i++) {
8893 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8894 			continue;
8895 
8896 		if (ctx->bc->chip_class == CAYMAN) {
8897 			for (j = 0 ; j < 4; j++) {
8898 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8899 
8900 				alu.op = ALU_OP2_MULLO_UINT;
8901 				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
8902 					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
8903 				}
8904 				alu.dst.chan = j;
8905 				alu.dst.sel = ctx->temp_reg;
8906 				alu.dst.write = (j == i);
8907 				if (j == 3)
8908 					alu.last = 1;
8909 				r = r600_bytecode_add_alu(ctx->bc, &alu);
8910 				if (r)
8911 					return r;
8912 			}
8913 		} else {
8914 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8915 
8916 			alu.dst.chan = i;
8917 			alu.dst.sel = ctx->temp_reg;
8918 			alu.dst.write = 1;
8919 
8920 			alu.op = ALU_OP2_MULLO_UINT;
8921 			for (j = 0; j < 2; j++) {
8922 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
8923 			}
8924 
8925 			alu.last = 1;
8926 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8927 			if (r)
8928 				return r;
8929 		}
8930 	}
8931 
8932 
8933 	for (i = 0; i < lasti + 1; i++) {
8934 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8935 			continue;
8936 
8937 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8938 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8939 
8940 		alu.op = ALU_OP2_ADD_INT;
8941 
8942 		alu.src[0].sel = ctx->temp_reg;
8943 		alu.src[0].chan = i;
8944 
8945 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8946 		if (i == lasti) {
8947 			alu.last = 1;
8948 		}
8949 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8950 		if (r)
8951 			return r;
8952 	}
8953 	return 0;
8954 }
8955 
tgsi_pk2h(struct r600_shader_ctx * ctx)8956 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
8957 {
8958 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8959 	struct r600_bytecode_alu alu;
8960 	int r, i;
8961 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8962 
8963 	/* temp.xy = f32_to_f16(src) */
8964 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8965 	alu.op = ALU_OP1_FLT32_TO_FLT16;
8966 	alu.dst.chan = 0;
8967 	alu.dst.sel = ctx->temp_reg;
8968 	alu.dst.write = 1;
8969 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8970 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8971 	if (r)
8972 		return r;
8973 	alu.dst.chan = 1;
8974 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
8975 	alu.last = 1;
8976 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8977 	if (r)
8978 		return r;
8979 
8980 	/* dst.x = temp.y * 0x10000 + temp.x */
8981 	for (i = 0; i < lasti + 1; i++) {
8982 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8983 			continue;
8984 
8985 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8986 		alu.op = ALU_OP3_MULADD_UINT24;
8987 		alu.is_op3 = 1;
8988 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8989 		alu.last = i == lasti;
8990 		alu.src[0].sel = ctx->temp_reg;
8991 		alu.src[0].chan = 1;
8992 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8993 		alu.src[1].value = 0x10000;
8994 		alu.src[2].sel = ctx->temp_reg;
8995 		alu.src[2].chan = 0;
8996 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8997 		if (r)
8998 			return r;
8999 	}
9000 
9001 	return 0;
9002 }
9003 
tgsi_up2h(struct r600_shader_ctx * ctx)9004 static int tgsi_up2h(struct r600_shader_ctx *ctx)
9005 {
9006 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9007 	struct r600_bytecode_alu alu;
9008 	int r, i;
9009 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9010 
9011 	/* temp.x = src.x */
9012 	/* note: no need to mask out the high bits */
9013 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9014 	alu.op = ALU_OP1_MOV;
9015 	alu.dst.chan = 0;
9016 	alu.dst.sel = ctx->temp_reg;
9017 	alu.dst.write = 1;
9018 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9019 	r = r600_bytecode_add_alu(ctx->bc, &alu);
9020 	if (r)
9021 		return r;
9022 
9023 	/* temp.y = src.x >> 16 */
9024 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9025 	alu.op = ALU_OP2_LSHR_INT;
9026 	alu.dst.chan = 1;
9027 	alu.dst.sel = ctx->temp_reg;
9028 	alu.dst.write = 1;
9029 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9030 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
9031 	alu.src[1].value = 16;
9032 	alu.last = 1;
9033 	r = r600_bytecode_add_alu(ctx->bc, &alu);
9034 	if (r)
9035 		return r;
9036 
9037 	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
9038 	for (i = 0; i < lasti + 1; i++) {
9039 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9040 			continue;
9041 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9042 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9043 		alu.op = ALU_OP1_FLT16_TO_FLT32;
9044 		alu.src[0].sel = ctx->temp_reg;
9045 		alu.src[0].chan = i % 2;
9046 		alu.last = i == lasti;
9047 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9048 		if (r)
9049 			return r;
9050 	}
9051 
9052 	return 0;
9053 }
9054 
/*
 * TGSI opcode dispatch table carrying the "r600" prefix (presumably used
 * for the R600/R700 generations -- confirm against the code that selects
 * the table).
 *
 * The array is indexed by TGSI opcode through designated initializers.
 * Each entry pairs a hardware opcode (ALU_OP*, FETCH_OP* or CF_OP*) with
 * the handler that translates the instruction.  Handlers such as
 * tgsi_loop_brk_cont() and tgsi_gs_emit() read an opcode through
 * ctx->inst_info->op, which presumably points at the matching entry.
 * Opcodes without a translation use tgsi_unsupported; the bare numeric
 * indices fill slots that have no TGSI_OPCODE_* name in this table.
 */
9055 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
9056 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
9057 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
9058 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
9059 
9060 	/* XXX:
9061 	 * For state trackers other than OpenGL, we'll want to use
9062 	 * _RECIP_IEEE instead.
9063 	 */
9064 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
9065 
9066 	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
9067 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
9068 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
9069 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
9070 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
9071 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
9072 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
9073 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
9074 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
9075 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
9076 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
9077 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
9078 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
9079 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
9080 	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
9081 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
9082 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
9083 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
9084 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
9085 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
9086 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
9087 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
9088 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
9089 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
9090 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
9091 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
9092 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
9093 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
9094 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
9095 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
9096 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9097 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
9098 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9099 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9100 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9101 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9102 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9103 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9104 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9105 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9106 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9107 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9108 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9109 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
9110 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9111 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9112 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9113 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9114 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9115 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9116 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9117 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9118 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9119 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9120 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9121 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9122 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
9123 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9124 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9125 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9126 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9127 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9128 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9129 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9130 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9131 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9132 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9133 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9134 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9135 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9136 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9137 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9138 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9139 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9140 	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
9141 	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
9142 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9143 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9144 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9145 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9146 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9147 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9148 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
9149 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9150 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9151 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9152 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9153 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9154 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9155 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9156 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9157 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9158 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9159 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9160 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9161 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9162 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9163 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9164 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9165 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9166 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
9167 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9168 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9169 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9170 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9171 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9172 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9173 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
9174 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9175 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9176 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
9177 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9178 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9179 	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
9180 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
9181 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9182 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9183 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9184 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9185 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9186 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
9187 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9188 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
9189 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9190 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9191 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9192 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9193 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9194 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9195 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9196 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9197 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9198 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9199 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
9200 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9201 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
9202 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9203 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9204 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9205 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9206 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9207 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9208 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9209 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9210 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9211 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9212 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9213 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9214 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9215 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9216 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9217 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9218 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
9219 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9220 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9221 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
9222 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9223 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9224 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9225 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9226 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9227 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
9228 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9229 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9230 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9231 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9232 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9233 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9234 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9235 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9236 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9237 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9238 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9239 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9240 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9241 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
9242 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
9243 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
9244 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
9245 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
9246 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
9247 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
9248 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
9249 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
9250 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
9251 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
9252 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
9253 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
9254 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
9255 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
9256 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9257 };
9258 
/*
 * TGSI opcode dispatch table carrying the "eg" prefix (presumably used for
 * the Evergreen generation -- confirm against the code that selects the
 * table).
 *
 * The array is indexed by TGSI opcode through designated initializers.
 * Each entry pairs a hardware opcode (ALU_OP*, FETCH_OP* or CF_OP*) with
 * the handler that translates the instruction.  Handlers such as
 * tgsi_loop_brk_cont() and tgsi_gs_emit() read an opcode through
 * ctx->inst_info->op, which presumably points at the matching entry.
 * Opcodes without a translation use tgsi_unsupported; the bare numeric
 * indices fill slots that have no TGSI_OPCODE_* name in this table.
 *
 * The tail of the table (F2D through DRSQ) wires up the double-precision
 * opcodes, including DFMA.
 *
 * NOTE(review): BREAKC maps to tgsi_unsupported here although
 * tgsi_loop_breakc() contains an EVERGREEN-specific path -- confirm this
 * is intentional.
 */
9259 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
9260 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
9261 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
9262 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
9263 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
9264 	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
9265 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
9266 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
9267 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
9268 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
9269 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
9270 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
9271 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
9272 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
9273 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
9274 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
9275 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
9276 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
9277 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
9278 	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
9279 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
9280 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
9281 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
9282 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
9283 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
9284 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
9285 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
9286 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
9287 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
9288 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
9289 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
9290 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
9291 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
9292 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
9293 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
9294 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9295 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
9296 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9297 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9298 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9299 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
9300 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9301 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9302 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9303 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9304 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9305 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9306 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9307 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
9308 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9309 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9310 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9311 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9312 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9313 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9314 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
9315 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9316 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9317 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9318 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9319 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9320 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
9321 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9322 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9323 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9324 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9325 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9326 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9327 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9328 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9329 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9330 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9331 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9332 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9333 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9334 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9335 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9336 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9337 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9338 	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9339 	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9340 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9341 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9342 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9343 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9344 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9345 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9346 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
9347 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9348 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9349 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9350 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9351 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9352 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9353 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9354 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9355 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9356 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9357 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9358 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9359 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9360 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9361 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9362 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9363 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9364 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
9365 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9366 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9367 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9368 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9369 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9370 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9371 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
9372 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9373 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9374 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
9375 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9376 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9377 	/* Refer below for TGSI_OPCODE_DFMA */
9378 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
9379 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9380 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9381 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9382 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9383 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9384 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
9385 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9386 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
9387 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9388 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9389 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9390 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9391 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9392 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9393 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9394 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9395 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9396 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9397 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
9398 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9399 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
9400 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9401 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9402 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9403 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9404 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9405 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9406 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9407 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9408 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9409 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9410 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9411 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9412 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9413 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9414 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9415 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9416 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
9417 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9418 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9419 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
9420 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9421 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9422 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9423 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9424 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9425 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
9426 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9427 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9428 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9429 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9430 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9431 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9432 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9433 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9434 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9435 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9436 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9437 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9438 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9439 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
9440 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
9441 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
9442 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
9443 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
9444 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
9445 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
9446 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
9447 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
9448 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
9449 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
9450 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
9451 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9452 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9453 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9454 	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
9455 	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
9456 	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
9457 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
9458 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
9459 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
9460 	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
9461 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
9462 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
9463 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
9464 	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
9465 	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
9466 	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
9467 	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
9468 	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
9469 	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9470 	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9471 	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
9472 	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
9473 	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
9474 	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
9475 	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
9476 	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
9477 	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
9478 	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
9479 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9480 };
9481 
/*
 * TGSI opcode dispatch table carrying the "cm" prefix (presumably the Cayman
 * variant described by the CAYMAN notes at the top of this file - TODO
 * confirm against the code that selects the table).
 *
 * The array is indexed by TGSI opcode through designated initializers. Every
 * entry is a pair: first a hardware opcode constant (ALU_OP*, FETCH_OP_*,
 * CF_OP_*, or a literal 0), then the function associated with that TGSI
 * opcode. How the pair is consumed is not visible from this table.
 *
 * Things worth knowing when editing:
 *  - Slots written as raw numbers ([22], [23], [32], ...) have no
 *    TGSI_OPCODE_* name in this table; all of them map to tgsi_unsupported.
 *  - Several entries use cayman_* functions instead of the generic tgsi_*
 *    ones: RCP, RSQ, SQRT, EX2, LG2, POW, COS, SIN, UMUL, IMUL_HI, UMUL_HI,
 *    DMUL, DDIV, DRCP, DSQRT and DRSQ.
 *  - The "less than" / "less or equal" style opcodes (SLT, SLE, FSLT, ISLT,
 *    USLT) are listed with a SETGT/SETGE opcode plus tgsi_op2_swap;
 *    presumably the operands are swapped so that a < b is evaluated as
 *    b > a - verify in tgsi_op2_swap.
 *  - NOTE(review): with designated initializers a later initializer for the
 *    same index overrides an earlier one, so do not reorder or duplicate
 *    entries without checking the numeric values of the TGSI_OPCODE_*
 *    constants.
 */
9482 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
9483 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
9484 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
9485 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
9486 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
9487 	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
9488 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
9489 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
9490 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
9491 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
9492 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
9493 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
9494 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
9495 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
9496 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
9497 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
9498 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
9499 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
9500 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
9501 	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
9502 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
9503 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
9504 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
9505 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
9506 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
9507 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
9508 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
9509 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
9510 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
9511 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
9512 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
9513 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
9514 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
9515 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
9516 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
9517 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9518 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
9519 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9520 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9521 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9522 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
9523 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9524 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9525 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9526 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9527 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9528 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9529 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9530 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
9531 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9532 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9533 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9534 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9535 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9536 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9537 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
9538 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9539 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9540 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9541 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9542 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9543 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
9544 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9545 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9546 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9547 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9548 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9549 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9550 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9551 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9552 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9553 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9554 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9555 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9556 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9557 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9558 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9559 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9560 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9561 	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9562 	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9563 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9564 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9565 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9566 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
9567 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9568 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9569 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
9570 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9571 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9572 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9573 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9574 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9575 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9576 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9577 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9578 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9579 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9580 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9581 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9582 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9583 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9584 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9585 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9586 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9587 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
9588 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9589 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9590 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9591 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9592 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9593 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9594 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
9595 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9596 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9597 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
9598 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9599 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9600 	/* TGSI_OPCODE_DFMA is initialized further down, next to TGSI_OPCODE_DMAD. */
9601 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
9602 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9603 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9604 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9605 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9606 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9607 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
9608 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9609 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
9610 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
9611 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9612 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9613 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9614 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9615 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9616 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9617 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
9618 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9619 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9620 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
9621 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9622 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
9623 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9624 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9625 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9626 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* SAMPLE through SAMPLE_INFO: every entry in this run maps to tgsi_unsupported. */
9627 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9628 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9629 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9630 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9631 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9632 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9633 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9634 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9635 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9636 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9637 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9638 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9639 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
9640 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9641 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9642 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	/*
	 * LOAD/STORE, the *FENCE opcodes and all ATOM* opcodes map to
	 * tgsi_unsupported in this table; BARRIER is the only entry in this
	 * run with its own function (tgsi_barrier).
	 */
9643 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9644 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9645 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9646 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9647 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9648 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
9649 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9650 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9651 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9652 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9653 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9654 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9655 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9656 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9657 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9658 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9659 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9660 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9661 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9662 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
9663 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
9664 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
9665 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
9666 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
9667 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
9668 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
9669 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
9670 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
9671 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
9672 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
9673 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
9674 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9675 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9676 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/*
	 * From here on the entries use *_64 hardware opcodes and/or the
	 * *_64 / double-named functions (presumably the double-precision
	 * opcode group - confirm against the TGSI opcode definitions).
	 */
9677 	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
9678 	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
9679 	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
9680 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
9681 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
9682 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
9683 	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
9684 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
9685 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
9686 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
9687 	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
9688 	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
9689 	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
9690 	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
9691 	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	/* DMAD and DFMA share the same opcode/function pair. */
9692 	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9693 	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9694 	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
9695 	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
9696 	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
9697 	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
9698 	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
9699 	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
9700 	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
9701 	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/*
	 * Final designator: assuming TGSI_OPCODE_LAST is the highest index used
	 * above, it fixes the array length at TGSI_OPCODE_LAST + 1 elements.
	 */
9702 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9703 };
9704